# Importação das bibliotecas

https://colab.research.google.com/drive/1tc6LiZJSn_YSzBMYuZePl8ataHzObtz-?usp=sharing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Show every column and every row when a DataFrame is displayed (no truncation).
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import warnings
# Silence all warnings for the rest of the notebook.
warnings.filterwarnings(action="ignore")

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA

import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Importação dos dados

In [None]:
# Training tweets: base table, vectorized media and vectorized text.
tweets, tweets_media, tweets_text = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Tweets/train_tweets.csv',
        'Tweets/train_tweets_vectorized_media.csv',
        'Tweets/train_tweets_vectorized_text.csv',
    )
)

# User table (merged into both the train and the test frames later on).
user = pd.read_csv('Users/users.csv')

# Test tweets: same three tables as for training.
teste_tweets, teste_tweets_media, teste_tweets_text = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Tweets/test_tweets.csv',
        'Tweets/test_tweets_vectorized_media.csv',
        'Tweets/test_tweets_vectorized_text.csv',
    )
)

# Análise exploratória

Vemos que as variáveis que consideramos interessantes de analisar mantêm boas proporções com relação à viralização do tweet.

In [None]:
# Number of tweets per virality level.
virality_counts = tweets["virality"].value_counts()
print(virality_counts,"\n")

# value_counts() is ordered by frequency; sort by level and take both axes from
# the same series so that every bar is drawn above its own label.
virality_by_level = virality_counts.sort_index()

plt.figure(figsize=(10,5))
# x/y are passed by keyword: recent seaborn releases reject positional x/y.
sns.barplot(x=virality_by_level.index, y=virality_by_level.values, palette="deep")
plt.title("Número de Tweets por nível de viralização")
plt.xlabel("Viralização")
plt.ylabel("Frequencia");

In [None]:
# Number of tweets per attachment class.
attachment_counts = tweets["tweet_attachment_class"].value_counts()
print(attachment_counts,"\n")

# value_counts() is ordered by frequency; sort by class name and take both axes
# from the same series so that every bar is drawn above its own label.
attachment_by_class = attachment_counts.sort_index()

plt.figure(figsize=(10,5))
# x/y are passed by keyword: recent seaborn releases reject positional x/y.
sns.barplot(x=attachment_by_class.index, y=attachment_by_class.values, palette = "deep")
plt.title("Número de Tweets por Attachment Class")
plt.xlabel("tweet_attachment_class")
plt.ylabel("Frequencia");

In [None]:
# One histogram of tweet_attachment_class per virality level, three panels per row.
g = sns.FacetGrid(
    data=tweets,
    col="virality",
    col_wrap=3,
    height=4,
    aspect=1,
    palette="deep",
)
g.map(sns.histplot, "tweet_attachment_class");

In [None]:
# Number of tweets per number of mentions.
mention_counts = tweets["tweet_mention_count"].value_counts()
print(mention_counts,"\n")

# value_counts() is ordered by frequency; sort by mention count and take both
# axes from the same series so that every bar is drawn above its own label.
mentions_sorted = mention_counts.sort_index()

plt.figure(figsize=(10,5))
# x/y are passed by keyword: recent seaborn releases reject positional x/y.
sns.barplot(x=mentions_sorted.index, y=mentions_sorted.values, palette = "deep")
plt.title("Número de Tweets por Menções realizadas")
plt.xlabel("tweet_mention_count")
plt.ylabel("Frequencia");

In [None]:
# One histogram of tweet_mention_count per virality level, three panels per row.
g = sns.FacetGrid(
    data=tweets,
    col="virality",
    col_wrap=3,
    height=4,
    aspect=1,
    palette="deep",
)
g.map(sns.histplot, "tweet_mention_count");

In [None]:
# Number of tweets per number of URLs in the text.
print(tweets["tweet_url_count"].value_counts(),"\n")

# Tweets (non-null virality values) per URL count; groupby orders the result by
# URL count, and both axes are taken from this one series so they always match.
tweets_per_url_count = tweets.groupby(by=["tweet_url_count"])["virality"].count()

plt.figure(figsize=(10,5))
# x/y are passed by keyword: recent seaborn releases reject positional x/y.
sns.barplot(x=tweets_per_url_count.index, y=tweets_per_url_count.values, palette = "deep")
plt.title("Número de Tweets por URL's no texto")
plt.xlabel("tweet_url_count")
plt.ylabel("Frequencia");

In [None]:
# One histogram of tweet_url_count per virality level, three panels per row.
g = sns.FacetGrid(
    data=tweets,
    col="virality",
    col_wrap=3,
    height=4,
    aspect=1,
    palette="deep",
)
g.map(sns.histplot, "tweet_url_count");

In [None]:
# Number of tweets per creation year.
print(tweets["tweet_created_at_year"].value_counts(),"\n")

# Tweets (non-null virality values) per year; groupby orders the result by year,
# and both axes are taken from this one series so they always match.
tweets_per_year = tweets.groupby(by=["tweet_created_at_year"])["virality"].count()

plt.figure(figsize=(10,5))
# x/y are passed by keyword: recent seaborn releases reject positional x/y.
sns.barplot(x=tweets_per_year.index, y=tweets_per_year.values, palette = "deep")
plt.title("Número de Tweets por Ano")
plt.xlabel("Ano")
plt.ylabel("Frequencia");

In [None]:
# One histogram of tweet_created_at_year per virality level, three panels per row.
g = sns.FacetGrid(
    data=tweets,
    col="virality",
    col_wrap=3,
    height=4,
    aspect=1,
    palette="deep",
)
g.map(sns.histplot, "tweet_created_at_year");

In [None]:
# Number of tweets per number of hashtags.
print(tweets["tweet_hashtag_count"].value_counts(),"\n")

# Tweets (non-null virality values) per hashtag count; groupby orders the result
# by hashtag count, and both axes are taken from this one series so they match.
tweets_per_hashtag_count = tweets.groupby(by=["tweet_hashtag_count"])["virality"].count()

plt.figure(figsize=(10,5))
# x/y are passed by keyword: recent seaborn releases reject positional x/y.
sns.barplot(x=tweets_per_hashtag_count.index, y=tweets_per_hashtag_count.values, palette = "deep")
plt.title("Número de Tweets por Quantidade de hashtag's")
# The x axis shows hashtag counts, not years (label was copied from the year plot).
plt.xlabel("tweet_hashtag_count")
plt.ylabel("Frequencia");

In [None]:
# One histogram of tweet_hashtag_count per virality level, three panels per row.
g = sns.FacetGrid(
    data=tweets,
    col="virality",
    col_wrap=3,
    height=4,
    aspect=1,
    palette="deep",
)
g.map(sns.histplot, "tweet_hashtag_count");

In [None]:
# Pearson correlation between the numeric/boolean tweet columns.
# Non-numeric columns are excluded explicitly: recent pandas releases no longer
# drop them silently inside DataFrame.corr and raise instead.
numeric_tweets = tweets.select_dtypes(include=["number", "bool"])
corr = numeric_tweets.corr(method="pearson")

# Hide the upper triangle (including the diagonal): the matrix is symmetric.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(15,10))

# Colour palette used for the heatmap (viridis)
cmap = sns.color_palette("viridis")

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5});

# Junção dos Dados

Tratamento das imagens. Iremos usar a informação da quantidade de imagens por tweet ao invés de tratar seu conteúdo.

In [None]:
# Media rows that belong to tweet 205.
tweets_media.loc[tweets_media["tweet_id"] == 205]

No caso acima, percebe-se que o tweet 205 possui 3 imagens; logo, basta agrupá-las e repetir este procedimento para todos os tweet_id's.

In [None]:
# Train

# Number of media rows (images) per tweet that has at least one.
image_counts = tweets_media.groupby("tweet_id").size()

# Tweets with no media row get an explicit count of zero.
# Built in one vectorised step rather than by appending one row per tweet
# inside a loop, which is quadratic in the number of tweets.
train_ids = pd.Series(tweets["tweet_id"].unique())
ids_without_images = train_ids[~train_ids.isin(image_counts.index)]
zero_counts = pd.Series(0, index=pd.Index(ids_without_images, name="tweet_id"), dtype="int64")

n_images = (
    pd.concat([image_counts, zero_counts])
    .rename_axis("tweet_id")
    .reset_index(name="num_images")
)

In [None]:
# Test

# Number of media rows (images) per test tweet that has at least one.
image_counts_test = teste_tweets_media.groupby("tweet_id").size()

# Test tweets with no media row get an explicit count of zero.
# Built in one vectorised step rather than by appending one row per tweet
# inside a loop, which is quadratic in the number of tweets.
test_ids = pd.Series(teste_tweets["tweet_id"].unique())
test_ids_without_images = test_ids[~test_ids.isin(image_counts_test.index)]
zero_counts_test = pd.Series(
    0, index=pd.Index(test_ids_without_images, name="tweet_id"), dtype="int64"
)

n_images_test = (
    pd.concat([image_counts_test, zero_counts_test])
    .rename_axis("tweet_id")
    .reset_index(name="num_images")
)

In [None]:
# Summary statistics of tweet_id for each number of images; the "count" column
# is the number of tweets in each group.
df = n_images.groupby("num_images")["tweet_id"].describe()
df

In [None]:
# Number of tweets for each number of images.
plt.figure(figsize=(10,5))

# x/y are passed by keyword: recent seaborn releases reject positional x/y.
sns.barplot(x=df.index, y=df["count"], palette = "deep");
plt.title("Número de Tweets por Quantidade de imagens")
plt.xlabel("Número de Imagens")
plt.ylabel("Frequencia");

In [None]:
# Train: attach the image count, the vectorized text and the author's user
# record to every training tweet (left joins keep all tweets).
train = (
    tweets
    .merge(n_images, how="left", on="tweet_id")
    .merge(tweets_text, how="left", on="tweet_id")
    .merge(user, how="left", left_on="tweet_user_id", right_on="user_id")
)

In [None]:
# One histogram of num_images per virality level, three panels per row.
g = sns.FacetGrid(
    data=train,
    col="virality",
    col_wrap=3,
    height=4,
    aspect=1,
    palette="deep",
)
g.map(sns.histplot, "num_images");

In [None]:
# Test: attach the image count, the vectorized text and the author's user
# record to every test tweet (left joins keep all tweets).
test = (
    teste_tweets
    .merge(n_images_test, how="left", on="tweet_id")
    .merge(teste_tweets_text, how="left", on="tweet_id")
    .merge(user, how="left", left_on="tweet_user_id", right_on="user_id")
)

In [None]:
# Distribution of images per tweet in the test frame.
# NOTE(review): the test frame presumably has no "virality" column (later cells
# feed it to the model without dropping one), so faceting by it would fail;
# fall back to a single histogram when the column is absent.
if "virality" in test.columns:
    g = sns.FacetGrid(test, col="virality", height=4, aspect=1, col_wrap=3, palette = "deep")
    g.map(sns.histplot, "num_images");
else:
    plt.figure(figsize=(10,5))
    sns.histplot(data=test, x="num_images")
    plt.title("Número de Tweets por Quantidade de imagens (teste)")
    plt.xlabel("Número de Imagens")
    plt.ylabel("Frequencia");

Dando uma olhada geral nos nossos dados:

In [None]:
# First five rows of the merged training frame.
train.head(n=5)

Já que usaremos modelos e métodos que necessitam que os dados sejam numéricos, iremos olhar mais profundamente as colunas não numéricas e tratá-las.

In [None]:
# First five rows of the boolean columns only.
bool_columns_preview = train.select_dtypes(include="bool")
bool_columns_preview.head(n=5)

Para as colunas que têm dados booleanos, basta transformá-los em 0 ou 1:

In [None]:
# Cast the boolean flags of the training frame to integers.
for flag_column in ("tweet_has_attachment", "user_has_location", "user_has_url"):
    train[flag_column] = train[flag_column].astype(int)

In [None]:
# Cast the boolean flags of the test frame to integers.
for flag_column in ("tweet_has_attachment", "user_has_location", "user_has_url"):
    test[flag_column] = test[flag_column].astype(int)

Colunas Objeto:

In [None]:
# First seven rows of the object-typed columns only.
object_columns_preview = train.select_dtypes(include="object")
object_columns_preview.head(n=7)

Para a variável tweet_attachment_class, testamos o label encoder e o one hot encoding. Este último melhorou o desempenho dos modelos no geral.

In [None]:
# One-hot encode the attachment class and append the indicator columns to train.
df = pd.get_dummies(train["tweet_attachment_class"])
train = pd.concat(objs=[train, df], axis="columns")

In [None]:
# One-hot encode the attachment class on the test frame using exactly the
# indicator columns produced for train. A class missing from test becomes an
# all-zero column and a class seen only in test is dropped, so both frames end
# up with the same columns in the same order for the downstream transforms.
train_dummy_columns = pd.get_dummies(train['tweet_attachment_class']).columns
df = pd.get_dummies(test['tweet_attachment_class']).reindex(columns=train_dummy_columns, fill_value=0)
test = pd.concat([test, df], axis = 1)

No caso do "tweet_topic_ids", após alguns testes percebemos que o modelo fica melhor se retirarmos essa variável.

In [None]:
# Column counts after the treatments above.
n_train_columns = train.shape[1]
n_test_columns = test.shape[1]

print(f"Número de colunas no conjunto de treino: {n_train_columns}")
print(f"Número de colunas no conjunto de teste: {n_test_columns}")

# Modelagem

In [None]:
# Target, and the features left after removing the target, the tweet/user
# identifiers, the topic ids and the raw attachment class (one-hot encoded above).
non_feature_columns = ["virality","tweet_id", "tweet_user_id", "tweet_topic_ids", 'tweet_attachment_class']
y = train["virality"]
X = train.drop(columns=non_feature_columns)

In [None]:
# Remove the same non-feature columns from the test frame.
# Note: after this cell `test` no longer contains "tweet_id".
test = test.drop(
    columns=["tweet_id", "tweet_user_id", "tweet_topic_ids", 'tweet_attachment_class']
)

In [None]:
# PCA 1: project the features onto 50 principal components.
# random_state fixes the seed used when a randomized SVD solver is selected,
# so re-running the notebook yields the same components.
pca1 = PCA(n_components=50, random_state=37)
pca1.fit(X)
X1 = pca1.transform(X)

In [None]:
# PCA 2: project the features onto 75 principal components.
# random_state fixes the seed used when a randomized SVD solver is selected,
# so re-running the notebook yields the same components.
pca2 = PCA(n_components=75, random_state=37)
pca2.fit(X)
X2 = pca2.transform(X)

In [None]:
# PCA 3: project the features onto 100 principal components.
# random_state fixes the seed used when a randomized SVD solver is selected,
# so re-running the notebook yields the same components.
pca3 = PCA(n_components=100, random_state=37)
pca3.fit(X)
X3 = pca3.transform(X)

In [None]:
# PCA 4: project the features onto 125 principal components.
# random_state fixes the seed used when a randomized SVD solver is selected,
# so re-running the notebook yields the same components.
pca4 = PCA(n_components=125, random_state=37)
pca4.fit(X)
X4 = pca4.transform(X)

Testamos com as 4 quantidades de componentes e a que se saiu melhor foi o PCA com 75 componentes

In [None]:
# Hold out 25% of the 75-component PCA features for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X2,
    y,
    test_size=0.25,
    random_state=37,
)

In [None]:
# Random forest baseline: fit on the training split, report validation accuracy.
rfc = RandomForestClassifier(
    max_depth=10,
    bootstrap=False,
    random_state=37,
)
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_val)
print(accuracy_score(y_true=y_val, y_pred=y_pred))

In [None]:
# Bagging of k-nearest-neighbour classifiers: fit on the training split and
# report validation accuracy.
knn_base = KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform',
    algorithm='auto',
    leaf_size=30,
    p=2,
    metric='manhattan',
    metric_params=None,
    n_jobs=-1,
)

# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator` — confirm against the installed version.
bag = BaggingClassifier(
    base_estimator=knn_base,
    n_estimators=100,
    max_samples=14812,
    max_features=37,
    bootstrap=True,
    bootstrap_features=True,
    n_jobs=-1,
    random_state=37,
)
bag.fit(X_train, y_train)

y_pred = bag.predict(X_val)
print(accuracy_score(y_true=y_val, y_pred=y_pred))

In [None]:
# XGBoost baseline: fit on the training split, report validation accuracy.
# Recent xgboost releases require class labels to be the integers 0..k-1, so the
# target is encoded for fitting and predictions are decoded back to the original
# virality labels before scoring.
xgb_label_encoder = LabelEncoder()
y_train_encoded = xgb_label_encoder.fit_transform(y_train)

reg = xgb.XGBClassifier()
reg.fit(X_train, y_train_encoded)
y_pred = xgb_label_encoder.inverse_transform(reg.predict(X_val))

print(accuracy_score(y_val, y_pred))

In [None]:
# Extra-trees baseline: fit on the training split, report validation accuracy.
etr = ExtraTreesClassifier(
    n_estimators=200,
    criterion='gini',
    max_depth=13,
    max_features=None,
    random_state=37,
)
etr.fit(X_train, y_train)

y_pred = etr.predict(X_val)
print(accuracy_score(y_true=y_val, y_pred=y_pred))

In [None]:
# CatBoost baseline with default settings: fit on the training split and
# predict the validation split (accuracy is printed in the next cell).
cat = CatBoostClassifier()
cat.fit(X=X_train, y=y_train)

y_pred = cat.predict(X_val)

In [None]:
# Validation accuracy of the most recent predictions held in y_pred.
print(accuracy_score(y_true=y_val, y_pred=y_pred))

O melhor modelo foi o BaggingClassifier

Vamos tunar o modelo para obter o melhor desempenho

In [None]:
# Randomized hyper-parameter search for the BaggingClassifier, scored with
# k-fold cross-validation on the training split.
cv_folds = 3

n_samples = X2.shape[0]
n_features = X2.shape[1]

# Absolute `max_samples` candidate: half of the full data set, capped at the
# smallest training fold the search actually fits on (X_train split into
# `cv_folds` folds). An integer `max_samples` larger than the data passed to
# fit is rejected by BaggingClassifier, and with warnings silenced that
# candidate would just score NaN.
n_train = X_train.shape[0]
smallest_cv_train_fold = n_train - -(-n_train // cv_folds)  # n_train - ceil(n_train / cv_folds)
max_samples_abs = min(n_samples // 2, smallest_cv_train_fold)

bag = BaggingClassifier(random_state=37, n_jobs = -1)

# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator` — confirm against the installed version.
params = {'base_estimator': [None, LogisticRegression(random_state=37, n_jobs = -1), KNeighborsClassifier(n_jobs = -1)],
          'n_estimators': [20,50,100],
          'max_samples': [0.5, 1.0, max_samples_abs, ],
          'max_features': [0.5, 1.0, n_features//2, ],
          'bootstrap': [True, False],
          'bootstrap_features': [True, False]}

bagging_classifier_grid = RandomizedSearchCV(bag, param_distributions = params, cv = cv_folds, n_jobs = -1, verbose = 1, random_state=37, n_iter=25)
bagging_classifier_grid.fit(X_train, y_train)

print('Train Accuracy : %.3f'%bagging_classifier_grid.best_estimator_.score(X_train, y_train))
# Scored on the held-out validation split (X_val), not on the competition test set.
print('Validation Accuracy : %.3f'%bagging_classifier_grid.best_estimator_.score(X_val, y_val))
print('Best Accuracy Through Grid Search : %.3f'%bagging_classifier_grid.best_score_)
print('Best Parameters : ',bagging_classifier_grid.best_params_)

In [None]:
# Final model: bagging of k-nearest-neighbour classifiers with the selected
# hyper-parameters.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator` — confirm against the installed version.
model = BaggingClassifier(random_state=37, n_estimators= 50, max_samples= 14812, max_features = 0.5, bootstrap_features= True, bootstrap= True, n_jobs=-1,
                       base_estimator = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan', 
                                                             metric_params=None, n_jobs=-1, n_neighbors=5, p=2, weights='uniform'))

# `random_state` only takes effect together with shuffle=True; current
# scikit-learn raises an error when a seed is given without shuffling.
stratified_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=37)

# Run the (expensive) cross-validation once and derive both statistics from it.
cv_scores = cross_val_score(model, X2, y, scoring='accuracy', cv=stratified_folds)

media = round(np.mean(cv_scores),3)
erro = round(np.std(cv_scores),3)
print('Bagging :  '+str(media)+' +- '+str(erro))

# Criação do arquivo de submissão

In [None]:
# Project the test features with the 75-component PCA fitted on the training features.
X_test = pca2.transform(X=test)

In [None]:
# Fit the final model on all training data and predict the test set.
model.fit(X2, y)
y_pred = model.predict(X_test)

# "tweet_id" was dropped from `test` before modelling, so the ids are taken from
# the raw test table; the left merges that built `test` keep its row order.
# Guard against a merge having duplicated rows, which would misalign ids and
# predictions.
if len(teste_tweets) != len(y_pred):
    raise ValueError(
        f"Expected one prediction per test tweet, got {len(y_pred)} predictions "
        f"for {len(teste_tweets)} tweets"
    )

submission = pd.DataFrame(data = {"tweet_id": teste_tweets["tweet_id"].to_numpy(), 
                                  "virality": y_pred})
submission.to_csv("solution_format.csv",index=False)