In [5]:
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import NMF
from nltk.stem.snowball import SnowballStemmer
import pickle
import pandas as pd
import numpy as np

In [3]:
SEED = 1337

df = pd.read_csv("Tweets.csv")

df.loc[df.airline_sentiment == 'negative', 'airline_sentiment'] = 0
df.loc[df.airline_sentiment == 'neutral', 'airline_sentiment'] = 1
df.loc[df.airline_sentiment == 'positive', 'airline_sentiment'] = 2
airline_le = LabelEncoder()
df['airline'] = airline_le.fit_transform(df.airline)
stemmer = SnowballStemmer("english")
df["text"] = df["text"].apply(lambda x: " ".join([stemmer.stem(y) for y in x.split()]))
print(df.head())

             tweet_id airline_sentiment  airline  retweet_count  \
0  570306133677760513                 1        5              0   
1  570301130888122368                 2        5              0   
2  570301083672813571                 1        5              0   
3  570301031407624196                 0        5              0   
4  570300817074462722                 0        5              0   

                                                text  
0                @virginamerica what @dhepburn said.  
1  @virginamerica plus you'v ad commerci to the e...  
2  @virginamerica i didn't today... must mean i n...  
3  @virginamerica it realli aggress to blast obno...  
4  @virginamerica and it a realli big bad thing a...  


In [6]:
pipe = make_pipeline(TfidfVectorizer(), NMF(), LinearSVC())
params1 = {
          "tfidfvectorizer__min_df": [0.01, 0.1],
          "tfidfvectorizer__max_df": [0.6, 0.8],
          "tfidfvectorizer__sublinear_tf": [True],
          "tfidfvectorizer__use_idf": [True],
          "tfidfvectorizer__analyzer": ["word", "char"],
          #"nmf__random_state": [SEED],
          #"nmf__n_components": [1000, None],
          "linearsvc__random_state":[SEED],
          "linearsvc__C": [0.5, 1.0, 5.0]
          }
model1 = GridSearchCV(pipe, param_grid=params1, verbose=True)

y = df.airline_sentiment.values
df_train, df_test, y_train, y_test = train_test_split(df, y, test_size=0.25, stratify=y, random_state=SEED, shuffle=True)
print("Fitting model...")
model1.fit(df_train.text, y_train.astype(int))
with open("model1.pkl", "wb") as io:
    pickle.dump(model1, io)
print(model1.best_params_)
y_pred = model1.predict(df_test.text)
print(classification_report(y_test.astype(int), y_pred.astype(int)))

Fitting model...
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed: 29.2min finished


{'linearsvc__C': 0.5, 'linearsvc__random_state': 1337, 'tfidfvectorizer__analyzer': 'word', 'tfidfvectorizer__max_df': 0.8, 'tfidfvectorizer__min_df': 0.01, 'tfidfvectorizer__sublinear_tf': True, 'tfidfvectorizer__use_idf': True}


             precision    recall  f1-score   support

          0       0.80      0.92      0.86      2294
          1       0.61      0.47      0.53       775
          2       0.72      0.53      0.61       591

avg / total       0.75      0.76      0.75      3660

