# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split

# Silence pandas' SettingWithCopyWarning for the whole notebook (pandas default is 'warn').
# NOTE(review): this hides warnings for column assignments made on the train/test slices below.
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# Tweet-level table, ordered by tweet id.
df_tweets = pd.read_csv('train_tweets.csv')
df_tweets = df_tweets.sort_values('tweet_id')

# Pre-vectorized text and media tables.
df_text = pd.read_csv('train_tweets_vectorized_text.csv')
df_media = pd.read_csv('train_tweets_vectorized_media.csv')

Como não será possível fazer a submissão para teste, o dataset de treino foi subdividido para ser usado como teste. Foram selecionados também, dos datasets de texto e de mídia, os registros dos tweets correspondentes aos tweets de treino e de teste.

In [None]:
# Hold out 20% of the tweets as a local test set (fixed seed).
df_train_tweets, df_test_tweets = train_test_split(df_tweets, test_size=0.2, random_state=0)

train_tweets_ids = df_train_tweets['tweet_id']
test_tweets_ids = df_test_tweets['tweet_id']

# Row masks telling which text/media rows belong to each split.
text_in_train = df_text['tweet_id'].isin(train_tweets_ids)
text_in_test = df_text['tweet_id'].isin(test_tweets_ids)
media_in_train = df_media['tweet_id'].isin(train_tweets_ids)
media_in_test = df_media['tweet_id'].isin(test_tweets_ids)

df_train_text = df_text[text_in_train]
df_train_media = df_media[media_in_train]

df_test_text = df_text[text_in_test]
df_test_media = df_media[media_in_test]

In [None]:
# User-level tables: user info plus vectorized profile images and descriptions.
df_users = pd.read_csv('users.csv')
df_users_images = pd.read_csv('user_vectorized_profile_images.csv')
df_users_descriptions = pd.read_csv('user_vectorized_descriptions.csv')

# Pré-processamento

## Dados de texto

Passo a passo do pré-processamento dos dados de texto

**Colocar coluna de ID como índice (temporariamente)** <br>
Isso foi feito para elas não serem afetadas na manipulação e cálculos

In [None]:
# Move tweet_id into the index so the scaler and PCA below only see feature columns.
df_train_text_features = df_train_text.set_index('tweet_id')
df_test_text_features = df_test_text.set_index('tweet_id')

**Normalização das features de texto**

In [None]:
# Standardize the text features; statistics are learned on the training split only.
text_scaler = StandardScaler()
df_train_text_features_scaled = text_scaler.fit_transform(df_train_text_features)
df_test_text_features_scaled = text_scaler.transform(df_test_text_features)

**Decomposição em componentes principais**

Foram utilizadas 50 componentes principais, que conseguem explicar cerca de 70% da variância dos dados

In [None]:
# Fit an unrestricted PCA on the scaled training text features to inspect the variance spectrum.
pca_text = PCA()
pca_text.fit(df_train_text_features_scaled)

limit = 200     # number of leading components shown in the plots
n_pc_text = 50  # number of components kept by the following cells

fig, (ax_ratio, ax_cumulative) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: variance ratio of each individual component.
ax_ratio.plot(pca_text.explained_variance_ratio_[:limit])
ax_ratio.set_title('Variance explained by each principal component')
ax_ratio.vlines(x=n_pc_text, ymin=0, ymax=pca_text.explained_variance_ratio_[0])

# Right panel: cumulative variance ratio, so the title refers to the first n components.
ax_cumulative.plot(np.cumsum(pca_text.explained_variance_ratio_[:limit]))
ax_cumulative.vlines(x=n_pc_text, ymin=0, ymax=1)
ax_cumulative.set_title('Variance explained by n principal components')

plt.show()

Após o cálculo das componentes principais, foi recolocado o id do tweet como coluna

In [None]:
# Refit keeping only n_pc_text components. random_state makes the projection
# reproducible when scikit-learn picks a randomized SVD solver.
pca_text = PCA(n_components=n_pc_text, random_state=0).fit(df_train_text_features_scaled)
train_PC_text = pca_text.transform(df_train_text_features_scaled)
test_PC_text = pca_text.transform(df_test_text_features_scaled)

# Rebuild a DataFrame with named component columns and bring tweet_id back as a column.
df_train_text_pc = pd.DataFrame(train_PC_text, columns=['PC{}_text'.format(i+1) for i in range(n_pc_text)], index=df_train_text_features.index)
df_train_text_pc = df_train_text_pc.reset_index()
df_train_text_pc.head()

In [None]:
# Same layout as the training frame: named component columns, tweet_id restored as a column.
text_pc_columns = ['PC{}_text'.format(n) for n in range(1, n_pc_text + 1)]
df_test_text_pc = pd.DataFrame(
    test_PC_text,
    columns=text_pc_columns,
    index=df_test_text_features.index,
).reset_index()
df_test_text_pc.head()

In [None]:
# Sum the per-column null counts of both splits first and only then take the top 5;
# adding two independently truncated Series yields NaN for labels present in only one of them.
(df_train_text_pc.isna().sum() + df_test_text_pc.isna().sum()).sort_values(ascending=False)[:5]

## Dados de mídia

In [None]:
# Preview the first rows of the training media features.
df_train_media.head()

**Número de mídias por tweet** <br>
Foi calculado o número de mídias por tweet

In [None]:
# Count media rows per tweet; the count column is named 'n_media'.
n_media_train = df_train_media.groupby('tweet_id').size().reset_index(name='n_media')
n_media_test = df_test_media.groupby('tweet_id').size().reset_index(name='n_media')

**Média entre as mídias** <br>
Para tweets com mais de uma mídia, foi feita a média entre as features de todas as mídias

In [None]:
# Collapse to one row per tweet by averaging the feature columns of its media rows.
df_train_media_by_tweet = df_train_media.groupby('tweet_id').mean().reset_index()
df_train_media_by_tweet.head()

In [None]:
# Collapse to one row per tweet by averaging the feature columns of its media rows.
df_test_media_by_tweet = df_test_media.groupby('tweet_id').mean().reset_index()
df_test_media_by_tweet.head()

**Remover coluna de ID (temporariamente)** <br>
A coluna foi removida para não ser alterada pelos cálculos

In [None]:
# Feature-only views: tweet_id is dropped here and re-attached after the PCA step.
df_train_media_features = df_train_media_by_tweet.drop(columns=['tweet_id'])
df_test_media_features = df_test_media_by_tweet.drop(columns=['tweet_id'])

**Normalização**

In [None]:
# Standardize the media features; statistics are learned on the training split only.
media_scaler = StandardScaler()
df_train_media_features_scaled = media_scaler.fit_transform(df_train_media_features)
df_test_media_features_scaled = media_scaler.transform(df_test_media_features)

**Componentes principais** <br>
Foram utilizadas apenas 20 componentes principais. Apesar de elas explicarem apenas cerca de 30% da variância dos dados, optei por mantê-las para verificar se elas teriam relevância no modelo final. Para chegar a um nível melhor de explicabilidade, seria necessário adicionar muitas outras componentes, o que achei que não compensaria.

In [None]:
# Unrestricted PCA on the scaled training media features, used only to inspect the variance spectrum.
pca_media = PCA()
pca_media.fit(df_train_media_features_scaled)

limit = 100      # number of leading components shown in the plots
n_pc_media = 20  # number of components kept by the following cells

media_variance_ratio = pca_media.explained_variance_ratio_

fig, (ax_ratio, ax_cumulative) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: variance ratio of each individual component.
ax_ratio.plot(media_variance_ratio[:limit])
ax_ratio.set_title('Variance explained by each principal component')
ax_ratio.vlines(x=n_pc_media, ymin=0, ymax=media_variance_ratio[0])

# Right panel: cumulative variance ratio.
ax_cumulative.plot(np.cumsum(media_variance_ratio[:limit]))
ax_cumulative.vlines(x=n_pc_media, ymin=0, ymax=1)
ax_cumulative.set_title('Variance explained by n principal components')

plt.show()

In [None]:
# Refit keeping only n_pc_media components. random_state makes the projection
# reproducible when scikit-learn picks a randomized SVD solver.
pca_media = PCA(n_components=n_pc_media, random_state=0).fit(df_train_media_features_scaled)
train_PC_media = pca_media.transform(df_train_media_features_scaled)
test_PC_media = pca_media.transform(df_test_media_features_scaled)

Após o cálculo das componentes principais, foi recolocado o id do tweet como coluna

In [None]:
# Named component columns plus tweet_id, matched by row position
# (both frames carry the default integer index).
media_pc_columns = ['PC{}_media'.format(n) for n in range(1, n_pc_media + 1)]
df_train_media_pc = pd.DataFrame(train_PC_media, columns=media_pc_columns).assign(
    tweet_id=df_train_media_by_tweet['tweet_id']
)
df_train_media_pc.head()

In [None]:
# Named component columns plus tweet_id, matched by row position
# (both frames carry the default integer index).
media_pc_columns = ['PC{}_media'.format(n) for n in range(1, n_pc_media + 1)]
df_test_media_pc = pd.DataFrame(test_PC_media, columns=media_pc_columns).assign(
    tweet_id=df_test_media_by_tweet['tweet_id']
)
df_test_media_pc.head()

E também o número de mídias do tweet foi adicionado fazendo o merge com o dataset criado anteriormente com esses dados

In [None]:
# Attach the per-tweet media count to the component frame.
df_train_media_pc = df_train_media_pc.merge(n_media_train, on=['tweet_id'], how='left')
df_train_media_pc.head()

In [None]:
# Attach the per-tweet media count to the component frame.
df_test_media_pc = df_test_media_pc.merge(n_media_test, on=['tweet_id'], how='left')
df_test_media_pc.head()

In [None]:
# Per-column null counts, train and test added together.
df_train_media_pc.isna().sum() + df_test_media_pc.isna().sum()

## Dados dos Tweets

In [None]:
# Preview the first rows of the training tweets.
df_train_tweets.head()

Foi verificado se de fato havia somente um texto por tweet

In [None]:
# Largest number of rows sharing one tweet_id (1 means ids are unique).
int(df_train_tweets.groupby('tweet_id').size().max())

In [None]:
# Largest number of rows sharing one tweet_id (1 means ids are unique).
int(df_test_tweets.groupby('tweet_id').size().max())

### Tratamento de attachment

A coluna *'tweet_has_attachment'* foi transformada para numérica (False = 0, True = 1) e a coluna *'tweet_attachment_class'* foi mapeada para 3 colunas com seus possíveis valores, fazendo um one-hot encoding

In [None]:
# Boolean flag -> 0/1.
attachment_flag = df_train_tweets['tweet_has_attachment'].replace({False: 0, True: 1})
df_train_tweets['tweet_has_attachment'] = attachment_flag

# One-hot encode the attachment class; the guard lets the cell be re-run after the column is gone.
if 'tweet_attachment_class' in df_train_tweets.columns:
  attachment_dummies = pd.get_dummies(df_train_tweets['tweet_attachment_class'], prefix='attachment')
  df_train_tweets = df_train_tweets.drop(columns='tweet_attachment_class').join(attachment_dummies)

df_train_tweets.sort_values('virality', ascending=False).head()

In [None]:
# Boolean flag -> 0/1.
attachment_flag = df_test_tweets['tweet_has_attachment'].replace({False: 0, True: 1})
df_test_tweets['tweet_has_attachment'] = attachment_flag

# One-hot encode the attachment class; the guard lets the cell be re-run after the column is gone.
if 'tweet_attachment_class' in df_test_tweets.columns:
  attachment_dummies = pd.get_dummies(df_test_tweets['tweet_attachment_class'], prefix='attachment')
  df_test_tweets = df_test_tweets.drop(columns='tweet_attachment_class').join(attachment_dummies)

df_test_tweets.head()

### Tratamento de tópicos



In [None]:
# Null count per column of the training tweets.
df_train_tweets.isna().sum()

In [None]:
# Work on copies so the topic processing below does not modify df_train_tweets / df_test_tweets.
df_train_tweets_topics = df_train_tweets.copy()
df_test_tweets_topics = df_test_tweets.copy()

Os valores nulos foram tratados adicionando um vetor vazio ao campo de tópicos

In [None]:
# Missing topic lists become the literal string '[]' (an empty list in the column's text format).
df_train_tweets_topics['tweet_topic_ids'] = df_train_tweets_topics['tweet_topic_ids'].fillna('[]')
df_train_tweets_topics.sort_values('virality', ascending=False).head()

In [None]:
# Missing topic lists become the literal string '[]' (an empty list in the column's text format).
df_test_tweets_topics['tweet_topic_ids'] = df_test_tweets_topics['tweet_topic_ids'].fillna('[]')
df_test_tweets_topics.sort_values('virality', ascending=False).head()

In [None]:
# Per-column null counts, train and test added together.
df_train_tweets_topics.isna().sum() + df_test_tweets_topics.isna().sum()

Foi necessário converter o campo de tópicos, armazenado como texto, em uma lista de identificadores de tópicos; também foi calculado o número de tópicos de cada tweet

In [None]:
def topics_treat(topics):
  """Parse the text form of a topic list into a list of topic-id strings.

  Args:
    topics: a string such as "['36', '37']" or '[]'. A list/tuple that was
      already parsed is returned as a list, and any other non-string value
      (NaN, None) is treated as "no topics".

  Returns:
    List of topic ids as strings, with brackets, quotes and spaces removed.
    Empty entries (e.g. from '[ ]' or a trailing comma) are discarded.
  """
  # Already parsed: keep the values, so re-running the cell does not crash.
  if isinstance(topics, (list, tuple)):
    return list(topics)

  # Missing value (NaN / None): no topics.
  if not isinstance(topics, str):
    return []

  # Strip the list punctuation in one pass, then split on commas.
  cleaned = topics.translate(str.maketrans('', '', "[] '"))
  return [topic_id for topic_id in cleaned.split(',') if topic_id]

# Parse the topic column of both splits element by element.
df_train_tweets_topics['tweet_topic_ids'] = df_train_tweets_topics['tweet_topic_ids'].map(topics_treat)
df_test_tweets_topics['tweet_topic_ids'] = df_test_tweets_topics['tweet_topic_ids'].map(topics_treat)
df_train_tweets_topics.sort_values('virality', ascending=False).head()

In [None]:
# Number of topics per tweet = length of the parsed topic list.
df_train_tweets_topics['n_topics'] = df_train_tweets_topics['tweet_topic_ids'].map(len)
df_test_tweets_topics['n_topics'] = df_test_tweets_topics['tweet_topic_ids'].map(len)
df_train_tweets_topics.sort_values('virality', ascending=False).head()

Os tópicos foram mapeados para colunas próprias, fazendo um one-hot, mas com uma matriz esparsa, para economizar RAM

In [None]:
# One-hot encode the topic lists as a sparse matrix (one 'topic_<id>' column per topic).
# The topic column is read, not popped, so the source frame is left intact and
# the cell can be re-run without a KeyError.
mlb = MultiLabelBinarizer(sparse_output=True)
train_topic_matrix = mlb.fit_transform(df_train_tweets_topics['tweet_topic_ids'])
df_train_tweets_mlb = df_train_tweets_topics.drop(columns='tweet_topic_ids').join(
    pd.DataFrame.sparse.from_spmatrix(
        train_topic_matrix,
        index=df_train_tweets_topics.index,
        columns=['topic_{}'.format(topic) for topic in mlb.classes_])
)
df_train_tweets_mlb.sort_values('virality', ascending=False).head()

In [None]:
# One-hot encode the test topic lists as a sparse matrix (one 'topic_<id>' column per topic).
# The topic column is read, not popped, so the source frame is left intact and
# the cell can be re-run without a KeyError.
mlb = MultiLabelBinarizer(sparse_output=True)
test_topic_matrix = mlb.fit_transform(df_test_tweets_topics['tweet_topic_ids'])
df_test_tweets_mlb = df_test_tweets_topics.drop(columns='tweet_topic_ids').join(
    pd.DataFrame.sparse.from_spmatrix(
        test_topic_matrix,
        index=df_test_tweets_topics.index,
        columns=['topic_{}'.format(topic) for topic in mlb.classes_])
)
df_test_tweets_mlb.sort_values('virality', ascending=False).head()

As colunas da base de teste foram organizadas para corresponderem às mesmas da base de treino, removendo os tópicos que não estavam na base de treino e adicionando, com o valor 0, os tópicos que estavam na base de treino mas não na de teste.

In [None]:
train_columns = df_train_tweets_mlb.columns
test_columns = df_test_tweets_mlb.columns

# Columns present in just one of the two splits.
topics_only_train = set(train_columns) - set(test_columns)
topics_only_test = set(test_columns) - set(train_columns)

# Drop test-only columns, add train-only columns filled with 0, then use the train column order.
df_test_tweets_mlb = df_test_tweets_mlb.drop(columns=topics_only_test)
for topic in topics_only_train:
  df_test_tweets_mlb[topic] = 0

df_test_tweets_mlb = df_test_tweets_mlb[train_columns]
df_test_tweets_mlb.head()

In [None]:
# Preview the encoded training tweets with the highest virality.
df_train_tweets_mlb.sort_values('virality', ascending=False).head()

In [None]:
# Move both id columns into a MultiIndex so only feature/target columns remain as columns.
df_train_tweets_features = df_train_tweets_mlb.set_index(['tweet_id', 'tweet_user_id'])
df_test_tweets_features = df_test_tweets_mlb.set_index(['tweet_id', 'tweet_user_id'])

df_train_tweets_features.head()

In [None]:
# (rows, columns) of the training tweet features.
df_train_tweets_features.shape

In [None]:
# Bring the id columns back as regular columns. The guard keeps the cell idempotent:
# resetting a second time would add a spurious 'index' column that would later be used as a feature.
if df_train_tweets_features.index.names == ['tweet_id', 'tweet_user_id']:
  df_train_tweets_features = df_train_tweets_features.reset_index()
df_train_tweets_features.head()

In [None]:
# Bring the id columns back as regular columns. The guard keeps the cell idempotent:
# resetting a second time would add a spurious 'index' column that would later be used as a feature.
if df_test_tweets_features.index.names == ['tweet_id', 'tweet_user_id']:
  df_test_tweets_features = df_test_tweets_features.reset_index()
df_test_tweets_features.head()

In [None]:
# Per-column null counts (train + test), largest first.
(df_train_tweets_features.isna().sum() + df_test_tweets_features.isna().sum()).sort_values(ascending=False)

## Dados de usuários

### User info

In [None]:
# Preview the first rows of the user table.
df_users.head()

As colunas booleanas foram mapeadas para valores numéricos

In [None]:
# Boolean user flags -> 0/1.
user_flag_columns = ['user_has_url', 'user_has_location']
df_users[user_flag_columns] = df_users[user_flag_columns].replace({False: 0, True: 1})
df_users.head()

In [None]:
# Null count per column of the user table.
df_users.isna().sum()

### User descriptions

In [None]:
# Preview the first rows of the vectorized user descriptions.
df_users_descriptions.head()

In [None]:
# Feature-only view: user_id is dropped here and re-attached after the PCA step.
df_users_descriptions_features = df_users_descriptions.drop(columns='user_id')

**Normalização**

In [None]:
# Standardize the description features (fitted on all users).
descriptions_scaler = StandardScaler()
df_users_descriptions_features_scaled = descriptions_scaler.fit_transform(df_users_descriptions_features)

**Componentes principais** <br>
Foram utilizadas 15 componentes principais, que descrevem cerca de 70% da variância dos dados

In [None]:
# Fit an unrestricted PCA on the scaled description features to inspect the variance spectrum.
pca_descriptions = PCA()
pca_descriptions.fit(df_users_descriptions_features_scaled)

limit = 200             # number of leading components shown in the plots
n_pc_descriptions = 15  # number of components kept by the following cells

fig, (ax_ratio, ax_cumulative) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: variance ratio of each individual component.
ax_ratio.plot(pca_descriptions.explained_variance_ratio_[:limit])
ax_ratio.set_title('Variance explained by each principal component')
ax_ratio.vlines(x=n_pc_descriptions, ymin=0, ymax=pca_descriptions.explained_variance_ratio_[0])

# Right panel: cumulative variance ratio, so the title refers to the first n components.
ax_cumulative.plot(np.cumsum(pca_descriptions.explained_variance_ratio_[:limit]))
ax_cumulative.vlines(x=n_pc_descriptions, ymin=0, ymax=1)
ax_cumulative.set_title('Variance explained by n principal components')

plt.show()

In [None]:
# Refit keeping only n_pc_descriptions components. random_state makes the projection
# reproducible when scikit-learn picks a randomized SVD solver.
pca_descriptions = PCA(n_components=n_pc_descriptions, random_state=0)
PC_descriptions = pca_descriptions.fit_transform(df_users_descriptions_features_scaled)

# Named component columns plus user_id, matched by row position with the source frame.
df_users_descriptions_pc = pd.DataFrame(PC_descriptions, columns=['PC{}_description'.format(i+1) for i in range(n_pc_descriptions)])
df_users_descriptions_pc['user_id'] = df_users_descriptions['user_id']
df_users_descriptions_pc.head()

In [None]:
# Null count per column of the description components.
df_users_descriptions_pc.isna().sum()

### User image

In [None]:
# Preview the first rows of the vectorized profile images.
df_users_images.head()

In [None]:
# Feature-only view: user_id is dropped here and re-attached after the PCA step.
df_users_images_features = df_users_images.drop(columns=['user_id'])

In [None]:
# (rows, columns) of the profile-image features.
df_users_images_features.shape

**Normalização**

In [None]:
# Standardize the profile-image features (fitted on all users).
images_scaler = StandardScaler()
df_users_images_features_scaled = images_scaler.fit_transform(df_users_images_features)

**Componentes principais** <br>
Foram utilizadas 15 componentes principais, que descrevem cerca de 70% da variância dos dados

In [None]:
# Fit an unrestricted PCA on the scaled profile-image features to inspect the variance spectrum.
pca_images = PCA()
pca_images.fit(df_users_images_features_scaled)

limit = 200       # number of leading components shown in the plots
n_pc_images = 15  # number of components kept by the following cells

fig, (ax_ratio, ax_cumulative) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: variance ratio of each individual component.
ax_ratio.plot(pca_images.explained_variance_ratio_[:limit])
ax_ratio.set_title('Variance explained by each principal component')
ax_ratio.vlines(x=n_pc_images, ymin=0, ymax=pca_images.explained_variance_ratio_[0])

# Right panel: cumulative variance ratio, so the title refers to the first n components.
ax_cumulative.plot(np.cumsum(pca_images.explained_variance_ratio_[:limit]))
ax_cumulative.vlines(x=n_pc_images, ymin=0, ymax=1)
ax_cumulative.set_title('Variance explained by n principal components')

plt.show()

In [None]:
# Refit keeping only n_pc_images components. random_state makes the projection
# reproducible when scikit-learn picks a randomized SVD solver.
pca_images = PCA(n_components=n_pc_images, random_state=0)
PC_images = pca_images.fit_transform(df_users_images_features_scaled)

# Named component columns plus user_id, matched by row position with the source frame.
df_users_images_pc = pd.DataFrame(PC_images, columns=['PC{}_images'.format(i+1) for i in range(n_pc_images)])
df_users_images_pc['user_id'] = df_users_images['user_id']
df_users_images_pc.head()

In [None]:
# Null count per column of the profile-image components.
df_users_images_pc.isna().sum()

## Junção dos dados

Os dados foram unidos em um só DataFrame fazendo o merge com as colunas *'tweet_id'* (para os dados de mídia e texto) e *'user_id'*/*'tweet_user_id'* (para os dados de usuário)

In [None]:
# Left-join every feature source onto the training tweets:
# text and media by tweet_id, user tables by user id.
df_train = (
    df_train_tweets_features
    .merge(df_train_text_pc, on=['tweet_id'], how='left')
    .merge(df_train_media_pc, on=['tweet_id'], how='left')
    .merge(df_users_descriptions_pc, left_on=['tweet_user_id'], right_on=['user_id'], how='left')
    .merge(df_users_images_pc, on=['user_id'], how='left')
    .merge(df_users, on=['user_id'], how='left')
)

In [None]:
# Tweets with no row in the media table get a media count of 0.
df_train['n_media'] = df_train['n_media'].fillna(0.0)
df_train.sort_values('virality', ascending=False).head()

In [None]:
# (rows, columns) of the merged training frame.
df_train.shape

In [None]:
# The 10 columns with the most nulls in the merged training frame.
df_train.isna().sum().sort_values(ascending=False)[:10]

In [None]:
# Left-join every feature source onto the test tweets:
# text and media by tweet_id, user tables by user id.
df_test = (
    df_test_tweets_features
    .merge(df_test_text_pc, on=['tweet_id'], how='left')
    .merge(df_test_media_pc, on=['tweet_id'], how='left')
    .merge(df_users_descriptions_pc, left_on=['tweet_user_id'], right_on=['user_id'], how='left')
    .merge(df_users_images_pc, on=['user_id'], how='left')
    .merge(df_users, on=['user_id'], how='left')
)

In [None]:
# Tweets with no row in the media table get a media count of 0.
df_test['n_media'] = df_test['n_media'].fillna(0.0)
df_test.head()

In [None]:
# The 10 columns with the most nulls in the merged test frame.
df_test.isna().sum().sort_values(ascending=False)[:10]

Os dados foram separados nas features (X) e target (y)

In [None]:
target = 'virality'
# Identifier columns and the target are excluded from the model inputs.
not_features = [target, 'tweet_id', 'user_id', 'tweet_user_id']

train_feature_columns = [name for name in df_train.columns if name not in not_features]
test_feature_columns = [name for name in df_test.columns if name not in not_features]

X_train = df_train[train_feature_columns]
y_train = df_train[target].astype(int)

X_test = df_test[test_feature_columns]
y_test = df_test[target].astype(int)

Os valores nulos nas colunas de mídia (nos tweets sem mídia) foram resolvidos colocando a mediana da coluna

In [None]:
# Impute missing values with the TRAINING medians on both splits. Using the test set's own
# medians would leak test statistics and make the two splits inconsistent.
# Plain assignment (no inplace=True) keeps the cell safe to re-run.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

### Normalization

In [None]:
# Standardize the merged feature matrix; statistics are learned on the training split only.
final_scaler = StandardScaler()
X_train_scaled = final_scaler.fit_transform(X_train)
X_test_scaled = final_scaler.transform(X_test)

### PCA

Foi estudada a aplicação de PCA, mas, pelos resultados obtidos, optou-se por manter as variáveis originais e fazer uma seleção de features

In [None]:
# Fit an unrestricted PCA on the final scaled training matrix to inspect the variance spectrum.
pca_final = PCA()
pca_final.fit(X_train_scaled)

limit = 200      # number of leading components shown in the plots
n_pc_final = 50  # reference cut-off drawn as a vertical line

fig, (ax_ratio, ax_cumulative) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: variance ratio of each individual component.
ax_ratio.plot(pca_final.explained_variance_ratio_[:limit])
ax_ratio.set_title('Variance explained by each principal component')
ax_ratio.vlines(x=n_pc_final, ymin=0, ymax=pca_final.explained_variance_ratio_[0])

# Right panel: cumulative variance ratio, so the title refers to the first n components.
ax_cumulative.plot(np.cumsum(pca_final.explained_variance_ratio_[:limit]))
ax_cumulative.vlines(x=n_pc_final, ymin=0, ymax=1)
ax_cumulative.set_title('Variance explained by n principal components')

plt.show()

# Modelos

In [None]:
!pip install catboost

In [None]:
!pip install xgboost

In [None]:
!pip install lightgbm

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor

from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMClassifier, LGBMRegressor

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif, f_regression, mutual_info_regression
from sklearn.model_selection import RandomizedSearchCV

## Feature selection

Seleção de feature usando duas funções diferentes: f_classif e mutual_info_classif. A barra vertical do gráfico separa, à esquerda, as features que foram selecionadas

In [None]:
# Number of features kept by the f_classif selector.
# NOTE(review): the name `k` is reassigned by a later selector cell.
k = 30
# Score every feature with f_classif against the target and keep the k best.
f_selector_fc = SelectKBest(score_func=f_classif, k=k)
f_selector_fc.fit(X_train_scaled, y_train)

In [None]:
# Feature scores from the f_classif selector, best first.
scores_fc = (
    pd.DataFrame({'feature': X_train.columns, 'score': f_selector_fc.scores_})
    .sort_values(by='score', ascending=False)
)

# Read k from the fitted selector: the notebook-level `k` is overwritten by a later cell,
# which would misplace the cut-off line when cells are re-run out of order.
n_selected_fc = f_selector_fc.k

fig, ax = plt.subplots(figsize=(30, 6))
ax.bar(range(len(scores_fc)), scores_fc['score'])
ax.set_xticks(range(len(scores_fc)))
ax.set_xticklabels(scores_fc['feature'], rotation=90)
ax.set_xlabel("feature")
# f_classif yields ANOVA F-values, not mutual information.
ax.set_ylabel("ANOVA F-value")
# Features to the left of the line are the selected ones; nanmax ignores undefined scores.
ax.vlines(x=n_selected_fc - 0.5, ymin=0, ymax=np.nanmax(f_selector_fc.scores_))
plt.show()

In [None]:
# Number of features kept by the mutual-information selector.
k = 35

# mutual_info_classif is a randomized estimator; fixing random_state makes the
# scores, and therefore the selected features, reproducible between runs.
f_selector_mic = SelectKBest(
    score_func=lambda X, y: mutual_info_classif(X, y, random_state=0),
    k=k,
)
f_selector_mic.fit(X_train_scaled, y_train)

In [None]:
# Feature scores from the mutual-information selector, best first.
scores_mic = (
    pd.DataFrame({'feature': X_train.columns, 'score': f_selector_mic.scores_})
    .sort_values(by='score', ascending=False)
)

# Read k from the fitted selector instead of the notebook-level `k`, which other cells
# reassign; this keeps the cut-off line correct when cells are re-run out of order.
n_selected_mic = f_selector_mic.k

fig, ax = plt.subplots(figsize=(30, 6))
ax.bar(range(len(scores_mic)), scores_mic['score'])
ax.set_xticks(range(len(scores_mic)))
ax.set_xticklabels(scores_mic['feature'], rotation=90)
ax.set_xlabel("feature")
ax.set_ylabel("Estimated MI value")
# Features to the left of the line are the selected ones; nanmax ignores undefined scores.
ax.vlines(x=n_selected_mic - 0.5, ymin=0, ymax=np.nanmax(f_selector_mic.scores_))
plt.show()

In [None]:
# Reduce both splits to the selected features:
# *_f1 uses the f_classif selector, *_f2 the mutual-information selector.
X_train_f1 = f_selector_fc.transform(X_train_scaled)
X_train_f2 = f_selector_mic.transform(X_train_scaled)

X_test_f1 = f_selector_fc.transform(X_test_scaled)
X_test_f2 = f_selector_mic.transform(X_test_scaled)

## Validação dos modelos

Foram feitos experimentos com alguns modelos, e os que obtiveram os melhores resultados foram o SVC e o LGBM, com o primeiro conjunto de features.

In [None]:
def validate_models_clf(X, y):
  """Print the mean cross-validated accuracy of a fixed set of classifiers.

  Args:
    X: feature matrix passed to cross_val_score.
    y: target labels passed to cross_val_score.

  Returns:
    None; one line per model is printed, rounded to 4 decimals.
  """
  candidates = [
      ('SVC', SVC()),
      ('LGBM', LGBMClassifier()),
      ('Naive Bayes', GaussianNB()),
      ('Ada Boost', AdaBoostClassifier()),
  ]

  print('')

  for label, estimator in candidates:
    fold_scores = cross_val_score(estimator, X, y, scoring='accuracy')
    print(label, ': ', np.round(fold_scores.mean(), 4))

In [None]:
# Cross-validate the candidate models on the f_classif-selected features.
validate_models_clf(X_train_f1, y_train)

In [None]:
# Cross-validate the candidate models on the mutual-information-selected features.
validate_models_clf(X_train_f2, y_train)

## Ajuste de hiperparâmetros

Foi feita uma busca randomizada para otimizar os hiperparâmetros, mas não foram obtidos bons resultados e optou-se por manter o LGBM com os hiperparâmetros default

In [None]:
# Search space for the LightGBM randomized search.
lgbm_params = {
    'n_estimators': [int(x) for x in np.linspace(10, 300, 10)],
    # -1 is LightGBM's value for "no depth limit" (its default), used here instead of None.
    'max_depth': [int(x) for x in np.linspace(10, 100, 10)] + [-1],
    'min_child_samples': [int(x) for x in np.linspace(1, 50, 10)],
    # Log-spaced between 0.001 and 1. The former linear grid up to 10 put nine of its
    # ten candidates above 1, where gradient boosting does not converge.
    'learning_rate': [float(x) for x in np.logspace(-3, 0, 10)],
}

In [None]:
# Randomized search over lgbm_params (100 sampled configurations, accuracy scoring).
# Fixes: the estimator keyword is `n_jobs` (not `jobs`), and the fit must be called on
# `lgbm_search` (the name `search` was never defined). random_state makes the sampled
# configurations reproducible.
lgbm_search = RandomizedSearchCV(
    LGBMClassifier(n_jobs=-1),
    lgbm_params,
    scoring='accuracy',
    n_iter=100,
    random_state=0,
)
lgbm_results = lgbm_search.fit(X_train_f1, y_train)

In [None]:
# Best mean cross-validated accuracy found by the LightGBM search.
lgbm_results.best_score_

In [None]:
# Search space for the SVC randomized search.
# NOTE(review): per scikit-learn's SVC docs, `degree` only affects the 'poly' kernel — confirm.
svc_params = {'C': [0.01, 0.05, 0.1, 0.5, 1, 10, 100],
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'degree': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              'gamma':['scale', 'auto']}

In [None]:
# Randomized search over svc_params (default number of sampled configurations, accuracy scoring).
# random_state makes the sampled configurations reproducible between runs.
svc_search = RandomizedSearchCV(SVC(), svc_params, scoring='accuracy', random_state=0)
svc_results = svc_search.fit(X_train_f1, y_train)

In [None]:
# Best mean cross-validated accuracy found by the SVC search.
svc_results.best_score_

## Testando o modelo

Teste do modelo nos dados que foram separados inicialmente, obtendo cerca de 63.5% de acurácia

In [None]:
# Final model: LightGBM with default hyperparameters, trained on the f_classif-selected features.
clf = LGBMClassifier().fit(X_train_f1, y_train)

In [None]:
# Predict the held-out test split with the same feature selection.
y_pred_clf = clf.predict(X_test_f1)
y_pred_clf

In [None]:
# Accuracy of the final model on the held-out test split.
accuracy_score(y_test, y_pred_clf)