In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("dataset/train.csv", index_col=0)

In [3]:
df.sample(5)

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10276,0,â #usd/cad trims gains to 1.2720 ahead of d...
17967,0,it's already half way to 2017 and i still don'...
7083,0,34 weeks #thiy #four #weeks #testosterone #f...
8171,1,this is a completely justified call out on my ...
9560,0,@user ...#ourleadersaresleeping ... ... #diss...


In [4]:
df["label"].unique()

array([0, 1])

In [5]:
df["tweet"].nunique()

29530

In [6]:
df.size

63924

In [7]:
df["tweet"].value_counts()

#model   i love u take with u all the time in urð±!!! ðððð
ð¦ð¦ð¦                                                        319
i finally found a way how to delete old tweets! you might find it useful as well:    #deletetweets                                           82
aww yeah it's all good bing bong bing bong                                                                                                   75
i'm so   and #grateful now that - #affirmations                                                                                              56
@user you might be a libtard if... #libtard  #sjw #liberal #politics                                                                         40
you might be a libtard if... #libtard  #sjw #liberal #politics                                                                               32
ð #love #instagood #photooftheday top.tags #tbt #cute #me #beautiful #followme   #followâ¦                                          

In [8]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/rcrespillo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
def tweets_lemm(tweet):
    lemmatizer = WordNetLemmatizer()
    word_list = nltk.word_tokenize(tweet)
    lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
    return lemmatized_output

In [10]:
df['tweet'] = df['tweet'].apply(lambda x : tweets_lemm(x))

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X = df["tweet"]
y = df["label"]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, stratify=y)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [15]:
nltk.download('punkt')
nltk.download('stopwords')
stops = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /home/rcrespillo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rcrespillo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
vectorizer = TfidfVectorizer(analyzer='word', 
                             tokenizer=nltk.word_tokenize, 
                             stop_words=stops, 
                             ngram_range=(1,3))

X_train_v = vectorizer.fit_transform(X_train)
X_test_v = vectorizer.transform(X_test)

In [17]:
X_train_v.shape

(25569, 390316)

In [18]:
vectorizer.vocabulary_

{'positive': 279083,
 '.': 64365,
 '#': 5970,
 'i_am': 207230,
 'affirmation': 94955,
 'positive .': 279134,
 '. #': 64372,
 '# i_am': 24275,
 'i_am #': 207231,
 '# positive': 35723,
 'positive #': 279086,
 '# affirmation': 7429,
 'positive . #': 279135,
 '. # i_am': 65045,
 '# i_am #': 24276,
 'i_am # positive': 207232,
 '# positive #': 35725,
 'positive # affirmation': 279087,
 'hope': 204523,
 'weekend': 366333,
 'happy': 196837,
 'beautiful': 110832,
 '!': 0,
 ':': 81418,
 ')': 56095,
 'salonlife': 298179,
 'stylist': 320737,
 'vermontsalon': 357856,
 '...': 71374,
 'hope weekend': 204775,
 'weekend happy': 366619,
 'happy beautiful': 197129,
 'beautiful !': 110833,
 '! !': 1,
 '! :': 2484,
 ': )': 81675,
 ') #': 56102,
 '# beautiful': 9892,
 'beautiful #': 110839,
 '# salonlife': 38435,
 'salonlife #': 298180,
 '# stylist': 41644,
 'stylist #': 320738,
 '# vermontsalon': 45624,
 'vermontsalon #': 357857,
 '# weekend': 46468,
 'weekend ...': 366502,
 'hope weekend happy': 204777,
 

In [20]:
from sklearn import decomposition

In [21]:
svd = decomposition.TruncatedSVD(n_components=2)

In [23]:
svd.fit(X_train_v)

TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=None, tol=0.0)

In [22]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, classification_report

In [23]:
y_pred = mlp_clf.predict(X_test_v)

In [24]:
#confusion_matrix(y_test, y_pred)

In [25]:
print("roc_auc score: {}".format(roc_auc_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))

roc_auc score: 0.7628268427850535
             precision    recall  f1-score   support

          0       0.97      1.00      0.98      5945
          1       0.92      0.53      0.67       448

avg / total       0.96      0.96      0.96      6393



In [26]:
X_all = vectorizer.transform(X)
y_all = y

In [27]:
mlp_clf.fit(X_all, y_all)

Iteration 1, loss = 0.45076652
Iteration 2, loss = 0.24601258
Iteration 3, loss = 0.14190490
Iteration 4, loss = 0.08174407
Iteration 5, loss = 0.04972015
Iteration 6, loss = 0.03319932
Iteration 7, loss = 0.02385617
Iteration 8, loss = 0.01817407
Iteration 9, loss = 0.01446235
Iteration 10, loss = 0.01191282
Iteration 11, loss = 0.01009742
Iteration 12, loss = 0.00875664
Iteration 13, loss = 0.00774495
Iteration 14, loss = 0.00696281
Iteration 15, loss = 0.00633324
Iteration 16, loss = 0.00582839
Iteration 17, loss = 0.00540767
Iteration 18, loss = 0.00504763
Iteration 19, loss = 0.00473986
Iteration 20, loss = 0.00446822
Iteration 21, loss = 0.00422901
Iteration 22, loss = 0.00401109
Iteration 23, loss = 0.00381435
Iteration 24, loss = 0.00363354
Iteration 25, loss = 0.00346945
Iteration 26, loss = 0.00331397
Iteration 27, loss = 0.00316922
Iteration 28, loss = 0.00303260
Iteration 29, loss = 0.00290545
Iteration 30, loss = 0.00278692
Iteration 31, loss = 0.00267435
Iteration 32, los

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=True, warm_start=False)

In [34]:
df_test = pd.read_csv("dataset/test.csv")

In [35]:
df_test.sample(5)

Unnamed: 0,id,tweet
12936,44899,could we see a #deadrising4 at #e32016? only t...
14019,45982,@user seriously considered but too poor after ...
9000,40963,@user if only i had a boyfriend to bring me i...
5097,37060,boattrip with all the girls! @user #moderosa #...
233,32196,ð #socialmedia may 2016 was hottest may o...


In [36]:
df_test['tweet'] = df_test['tweet'].apply(lambda x : tweets_lemm(x))

In [37]:
df_test.sample(5)

Unnamed: 0,id,tweet
12338,44301,not long now & amp ; @ user & amp ; i are off ...
3180,35143,@ user fundafield @ user cup team check in sta...
3052,35015,absolutely buzzing for friday ðð©ðð°...
3541,35504,"@ user @ user thank you sooo much , i ca n't w..."
1849,33812,@ user emma stone on hollywood : `` they 've g...


In [38]:
dtest = vectorizer.transform(df_test['tweet'])
test_pred = mlp_clf.predict(dtest)

In [39]:
df_test['label'] = test_pred

In [40]:
submission = df_test[['id','label']]

In [41]:
submission.to_csv('sub_rodixxi_mlpc_lemmatizer_2-1.csv', index=False)