In [30]:
import pandas as pd
import numpy as np
import nltk
import pickle
from nltk.corpus import stopwords
import re
from sklearn.utils import resample

In [31]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
print("Traning set: ",df_train.shape)
print("Test set: ",df_test.shape)

Traning set:  (31962, 3)
Test set:  (17197, 2)


In [32]:
df_train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [33]:
def cleanTweet(text):
    text = text.lower()
    text = re.sub(r'\W',' ',text)
    text = re.sub(r'\s+[a-z]\s+',' ',text)
    text = re.sub(r'\s+',' ',text)
    
    return text

df_train['cleanedTweet'] = df_train['tweet'].apply(lambda x: cleanTweet(x))
df_test['cleanedTweet'] = df_test['tweet'].apply(lambda x:cleanTweet(x))

In [34]:
df_train['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

### We can clearly see that the data is imbalanced

In [35]:
##Using Upsampling to balance the data
from sklearn.utils import resample
train_majority = df_train[df_train['label']==0]
train_minority = df_train[df_train['label']==1]
train_minority_upsampled = resample(train_minority,
                                   replace=True,
                                   n_samples=train_majority.shape[0],
                                   )
train_upsampled = pd.concat([train_minority_upsampled,train_majority])

In [36]:
train_upsampled['label'].value_counts()

1    29720
0    29720
Name: label, dtype: int64

In [75]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

vectorizer = CountVectorizer(max_features=5000,
                            min_df=3,
                            max_df=.6,
                            stop_words=stopwords.words('english')
                            )
X = vectorizer.fit_transform(train_upsampled['cleanedTweet'].tolist()).toarray()

transformer = TfidfTransformer()
X = transformer.fit_transform(X).toarray()

## Creating Pipeline

In [76]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

pipeline_sgd = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('travsformer',TfidfTransformer()),
    ('sgd',SGDClassifier())
])

In [78]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(train_upsampled['cleanedTweet'],
                                                train_upsampled['label']
                                                )

In [80]:
model = pipeline_sgd.fit(X_train,y_train)
y_pred = model.predict(X_test)

In [92]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

print("Classification Report:")
print(classification_report(y_test,y_pred))
cnf_matrix = confusion_matrix(y_test,y_pred)
print("-"*60)
print("Confusion Matrix")
print(cnf_matrix)
print("-"*60)
print("Accuracy Score: ",accuracy_score(y_test,y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.95      0.96      7327
           1       0.95      0.98      0.97      7533

    accuracy                           0.96     14860
   macro avg       0.97      0.96      0.96     14860
weighted avg       0.97      0.96      0.96     14860

------------------------------------------------------------
Confusion Matrix
[[6932  395]
 [ 127 7406]]
------------------------------------------------------------
Accuracy Score:  0.9648721399730821


## Saving the model in a pickle file

In [94]:
with open("model.pickle","wb") as file:
    pickle.dump(model,file)

## Using the saved model to test the model on un-seen data

In [97]:
with open("model.pickle","rb") as file:
    model = pickle.load(file)

In [98]:
model.predict(df_train['cleanedTweet'])

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [108]:
df_train[df_train['label']==1]['tweet'][34]

"it's unbelievable that in the 21st century we'd need something like this. again. #neverump  #xenophobia "

In [109]:
model.predict(["it's unbelievable that in the 21st century we'd need something like this. again. #neverump  #xenophobia"])

array([1], dtype=int64)

In [113]:
model.predict(["#studiolife #aislife #requires #passion #dedication #willpower   to find #newmaterialsâ\x80¦ "])

array([0], dtype=int64)