In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("dataset/train.csv", index_col=0)

In [3]:
df.sample(5)

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
21983,0,nove :) #flylondon #friends #sunglasses #love
18279,0,battlefield 1 looks unbelievable ð
5198,1,this is beyond upsetting. i was one of her big...
31931,1,@user #feminismiscancer #feminismisterrorism #...
4775,0,"@user doing that thing where you find an old,..."


In [4]:
df["label"].unique()

array([0, 1])

In [5]:
df["tweet"].nunique()

29530

In [6]:
df.size

63924

In [7]:
df["tweet"].value_counts()

#model   i love u take with u all the time in urð±!!! ðððð
ð¦ð¦ð¦                                                            319
i finally found a way how to delete old tweets! you might find it useful as well:    #deletetweets                                               82
aww yeah it's all good bing bong bing bong                                                                                                       75
i'm so   and #grateful now that - #affirmations                                                                                                  56
@user you might be a libtard if... #libtard  #sjw #liberal #politics                                                                             40
you might be a libtard if... #libtard  #sjw #liberal #politics                                                                                   32
ð #love #instagood #photooftheday top.tags #tbt #cute #me #beautiful #followme   #followâ¦                  

In [8]:
import nltk

In [9]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/rcrespillo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
def tweets_lemm(tweet):
    lemmatizer = WordNetLemmatizer()
    word_list = nltk.word_tokenize(tweet)
    lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
    return lemmatized_output

In [11]:
df['tweet'] = df['tweet'].apply(lambda x : tweets_lemm(x))

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X = df["tweet"]
y = df["label"]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, stratify=y)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [16]:
nltk.download('punkt')
nltk.download('stopwords')
stops = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /home/rcrespillo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rcrespillo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
vectorizer = TfidfVectorizer(analyzer='word', 
                             tokenizer=nltk.word_tokenize, 
                             stop_words=stops, 
                             ngram_range=(1,3))

X_train_v = vectorizer.fit_transform(X_train)
X_test_v = vectorizer.transform(X_test)

In [18]:
X_train_v.shape

(25569, 390590)

In [19]:
vectorizer.vocabulary_

{'â\x86\x9d': 380764,
 '#': 5900,
 'united': 345109,
 'state': 317372,
 '30-year': 78484,
 'bond': 119228,
 'auction': 105391,
 '2.475': 76672,
 '%': 48717,
 'previous': 281660,
 '2.615': 76686,
 'blog': 118072,
 'silver': 308364,
 'gold': 188375,
 'forex': 176160,
 'â\x86\x9d #': 380765,
 '# united': 44919,
 'united state': 345125,
 'state 30-year': 317401,
 '30-year bond': 78485,
 'bond auction': 119237,
 'auction 2.475': 105394,
 '2.475 %': 76673,
 '% previous': 48840,
 'previous 2.615': 281693,
 '2.615 %': 76687,
 '% #': 48718,
 '# blog': 10925,
 'blog #': 118077,
 '# silver': 39564,
 'silver #': 308365,
 '# gold': 21472,
 'gold #': 188379,
 '# forex': 19774,
 'â\x86\x9d # united': 380863,
 '# united state': 44922,
 'united state 30-year': 345129,
 'state 30-year bond': 317402,
 '30-year bond auction': 78486,
 'bond auction 2.475': 119238,
 'auction 2.475 %': 105395,
 '2.475 % previous': 76674,
 '% previous 2.615': 48843,
 'previous 2.615 %': 281694,
 '2.615 % #': 76688,
 '% # blog

In [20]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

In [21]:
param_grid = {
    'hidden_layer_sizes': [(10,)],
    'activation': ['tanh']
}

mlpc = MLPClassifier()
grid_search = GridSearchCV(mlpc,
                           param_grid,
                           cv=5,
                           scoring='roc_auc',
                           verbose=20,
                           refit=True,
                           n_jobs=-1)

In [22]:
grid_search.fit(X_train_v, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] activation=tanh, hidden_layer_sizes=(10,) .......................
[CV] activation=tanh, hidden_layer_sizes=(10,) .......................
[CV] activation=tanh, hidden_layer_sizes=(10,) .......................
[CV] activation=tanh, hidden_layer_sizes=(10,) .......................
[CV]  activation=tanh, hidden_layer_sizes=(10,), score=0.9657378686560694, total=16.1min
[CV] activation=tanh, hidden_layer_sizes=(10,) .......................


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 16.1min


[CV]  activation=tanh, hidden_layer_sizes=(10,), score=0.9644016414330027, total=16.2min


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 16.2min remaining: 24.3min


[CV]  activation=tanh, hidden_layer_sizes=(10,), score=0.9636810980378373, total=16.2min


[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 16.2min remaining: 10.8min


[CV]  activation=tanh, hidden_layer_sizes=(10,), score=0.9616489313404157, total=16.3min
[CV]  activation=tanh, hidden_layer_sizes=(10,), score=0.9678115949691298, total= 4.6min


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 20.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 20.7min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'hidden_layer_sizes': [(10,)], 'activation': ['tanh']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=20)

In [23]:
grid_search.best_params_

{'activation': 'tanh', 'hidden_layer_sizes': (10,)}

In [24]:
mlp_clf = grid_search.best_estimator_

In [25]:
#mlp_clf.fit(X_train_v, y_train)

In [26]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, classification_report

In [27]:
y_pred = mlp_clf.predict(X_test_v)

In [28]:
#confusion_matrix(y_test, y_pred)

In [29]:
print("roc_auc score: {}".format(roc_auc_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))

roc_auc score: 0.7726805613961313
             precision    recall  f1-score   support

          0       0.97      1.00      0.98      5945
          1       0.96      0.55      0.70       448

avg / total       0.97      0.97      0.96      6393



In [30]:
X_all = vectorizer.transform(X)
y_all = y

In [31]:
mlp_clf.fit(X_all, y_all)

MLPClassifier(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [32]:
df_test = pd.read_csv("dataset/test.csv")

In [33]:
df_test.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [34]:
df_test['tweet'] = df_test['tweet'].apply(lambda x : tweets_lemm(x))

In [35]:
df_test.head()

Unnamed: 0,id,tweet
0,31963,# studiolife # aislife # requires # passion # ...
1,31964,@ user # white # supremacist want everyone to ...
2,31965,safe way to heal your # acne ! ! # altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd # bihday to my amazing , hilarious # nephe..."


In [36]:
dtest = vectorizer.transform(df_test['tweet'])
test_pred = mlp_clf.predict(dtest)

In [37]:
df_test['label'] = test_pred

In [38]:
submission = df_test[['id','label']]

In [39]:
submission.to_csv('sub_rodixxi_mlpC_GS_lemma.csv', index=False)