In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("dataset/train.csv", index_col=0)

In [3]:
df.sample(5)

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
19449,0,so looking forward to my spa day tomorrow!!! @...
9685,0,"a very #bihday to the #gorgeous, sonam kapoo..."
2694,0,tomorrow it's been 30 years you died... i miss...
23624,0,the waiting is almost over...#pratice 1 #europ...
30347,0,"_animaladvocate: the ""#dairy industry"" destroy..."


In [4]:
df["label"].unique()

array([0, 1])

In [5]:
df["tweet"].nunique()

29530

In [6]:
df.size

63924

In [7]:
df["tweet"].value_counts()

#model   i love u take with u all the time in urð±!!! ðððð
ð¦ð¦ð¦                                                        319
i finally found a way how to delete old tweets! you might find it useful as well:    #deletetweets                                           82
aww yeah it's all good bing bong bing bong                                                                                                   75
i'm so   and #grateful now that - #affirmations                                                                                              56
@user you might be a libtard if... #libtard  #sjw #liberal #politics                                                                         40
you might be a libtard if... #libtard  #sjw #liberal #politics                                                                               32
ð #love #instagood #photooftheday top.tags #tbt #cute #me #beautiful #followme   #followâ¦                                          

In [8]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/rcrespillo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
def tweets_lemm(tweet):
    lemmatizer = WordNetLemmatizer()
    word_list = nltk.word_tokenize(tweet)
    lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
    return lemmatized_output

In [10]:
df['tweet'] = df['tweet'].apply(lambda x : tweets_lemm(x))

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X = df["tweet"]
y = df["label"]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, stratify=y)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [15]:
nltk.download('punkt')
nltk.download('stopwords')
stops = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /home/rcrespillo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rcrespillo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
vectorizer = TfidfVectorizer(analyzer='word', 
                             tokenizer=nltk.word_tokenize, 
                             stop_words=stops, 
                             ngram_range=(1,3))

X_train_v = vectorizer.fit_transform(X_train)
X_test_v = vectorizer.transform(X_test)

In [17]:
X_train_v.shape

(25569, 391320)

In [18]:
vectorizer.vocabulary_

{'@': 86913,
 'user': 347344,
 "'ll": 49806,
 "'treading": 54880,
 'board': 118912,
 "'": 49029,
 'famous': 167625,
 'theatre': 330962,
 'le': 223879,
 'month': 248110,
 '!': 0,
 'ticket': 333899,
 ':': 81455,
 'httâ\x80¦': 207038,
 '@ user': 87675,
 "user 'll": 348587,
 "'ll 'treading": 49817,
 "'treading board": 54881,
 "board '": 118919,
 "' famous": 49205,
 'famous theatre': 167653,
 'theatre le': 330986,
 'le month': 223949,
 'month !': 248111,
 '! ticket': 4614,
 'ticket :': 333930,
 ': httâ\x80¦': 82340,
 "@ user 'll": 87687,
 "user 'll 'treading": 348588,
 "'ll 'treading board": 49818,
 "'treading board '": 54882,
 "board ' famous": 118920,
 "' famous theatre": 49206,
 'famous theatre le': 167654,
 'theatre le month': 330987,
 'le month !': 223950,
 'month ! ticket': 248116,
 '! ticket :': 4616,
 'ticket : httâ\x80¦': 333931,
 '#': 5994,
 'sadieschoice': 298509,
 'bedoubleyouproductions': 111957,
 'wish': 371115,
 'u': 343606,
 'fathersday': 168971,
 'today': 335807,
 'music': 

In [19]:
from sklearn.neural_network import MLPClassifier

In [20]:
mlp_clf = MLPClassifier(hidden_layer_sizes=(10,), verbose=True)

In [21]:
mlp_clf.fit(X_train_v, y_train)

Iteration 1, loss = 0.52021059
Iteration 2, loss = 0.24496322
Iteration 3, loss = 0.15434266
Iteration 4, loss = 0.10696322
Iteration 5, loss = 0.07111473
Iteration 6, loss = 0.04700912
Iteration 7, loss = 0.03245416
Iteration 8, loss = 0.02365456
Iteration 9, loss = 0.01813999
Iteration 10, loss = 0.01450783
Iteration 11, loss = 0.01203759
Iteration 12, loss = 0.01026813
Iteration 13, loss = 0.00897823
Iteration 14, loss = 0.00800424
Iteration 15, loss = 0.00725013
Iteration 16, loss = 0.00665283
Iteration 17, loss = 0.00616745
Iteration 18, loss = 0.00576659
Iteration 19, loss = 0.00542748
Iteration 20, loss = 0.00513688
Iteration 21, loss = 0.00488417
Iteration 22, loss = 0.00465950
Iteration 23, loss = 0.00445890
Iteration 24, loss = 0.00427568
Iteration 25, loss = 0.00410852
Iteration 26, loss = 0.00395351
Iteration 27, loss = 0.00380937
Iteration 28, loss = 0.00367403
Iteration 29, loss = 0.00354682
Iteration 30, loss = 0.00342633
Iteration 31, loss = 0.00331167
Iteration 32, los

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=True, warm_start=False)

In [22]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, classification_report

In [23]:
y_pred = mlp_clf.predict(X_test_v)

In [24]:
#confusion_matrix(y_test, y_pred)

In [25]:
print("roc_auc score: {}".format(roc_auc_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))

roc_auc score: 0.7628268427850535
             precision    recall  f1-score   support

          0       0.97      1.00      0.98      5945
          1       0.92      0.53      0.67       448

avg / total       0.96      0.96      0.96      6393



In [26]:
X_all = vectorizer.transform(X)
y_all = y

In [27]:
mlp_clf.fit(X_all, y_all)

Iteration 1, loss = 0.45076652
Iteration 2, loss = 0.24601258
Iteration 3, loss = 0.14190490
Iteration 4, loss = 0.08174407
Iteration 5, loss = 0.04972015
Iteration 6, loss = 0.03319932
Iteration 7, loss = 0.02385617
Iteration 8, loss = 0.01817407
Iteration 9, loss = 0.01446235
Iteration 10, loss = 0.01191282
Iteration 11, loss = 0.01009742
Iteration 12, loss = 0.00875664
Iteration 13, loss = 0.00774495
Iteration 14, loss = 0.00696281
Iteration 15, loss = 0.00633324
Iteration 16, loss = 0.00582839
Iteration 17, loss = 0.00540767
Iteration 18, loss = 0.00504763
Iteration 19, loss = 0.00473986
Iteration 20, loss = 0.00446822
Iteration 21, loss = 0.00422901
Iteration 22, loss = 0.00401109
Iteration 23, loss = 0.00381435
Iteration 24, loss = 0.00363354
Iteration 25, loss = 0.00346945
Iteration 26, loss = 0.00331397
Iteration 27, loss = 0.00316922
Iteration 28, loss = 0.00303260
Iteration 29, loss = 0.00290545
Iteration 30, loss = 0.00278692
Iteration 31, loss = 0.00267435
Iteration 32, los

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=True, warm_start=False)

In [34]:
df_test = pd.read_csv("dataset/test.csv")

In [35]:
df_test.sample(5)

Unnamed: 0,id,tweet
12936,44899,could we see a #deadrising4 at #e32016? only t...
14019,45982,@user seriously considered but too poor after ...
9000,40963,@user if only i had a boyfriend to bring me i...
5097,37060,boattrip with all the girls! @user #moderosa #...
233,32196,ð #socialmedia may 2016 was hottest may o...


In [36]:
df_test['tweet'] = df_test['tweet'].apply(lambda x : tweets_lemm(x))

In [37]:
df_test.sample(5)

Unnamed: 0,id,tweet
12338,44301,not long now & amp ; @ user & amp ; i are off ...
3180,35143,@ user fundafield @ user cup team check in sta...
3052,35015,absolutely buzzing for friday ðð©ðð°...
3541,35504,"@ user @ user thank you sooo much , i ca n't w..."
1849,33812,@ user emma stone on hollywood : `` they 've g...


In [38]:
dtest = vectorizer.transform(df_test['tweet'])
test_pred = mlp_clf.predict(dtest)

In [39]:
df_test['label'] = test_pred

In [40]:
submission = df_test[['id','label']]

In [41]:
submission.to_csv('sub_rodixxi_mlpc_lemmatizer_2-1.csv', index=False)