# This notebook builds a model that classifies whether a post came from FB or Twitter, and pickles it so that it is available for Streamlit

In [18]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

import pickle

# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Load the combined facebook/Twitter posts from the project's data folder
bothf = pd.read_csv(filepath_or_buffer='data/bothf.csv')

In [3]:
# Peek at the first five rows to check the columns that were read in
bothf.head(n=5)

Unnamed: 0.1,Unnamed: 0,body,subreddit
0,0,Oh because they removed all the features u men...,facebook
1,1,One of my favourites\n\n[https://i.imgur.com/W...,facebook
2,2,i've literally been waiting 4 months. And stil...,facebook
3,3,"Hi, u/AtroopAT8,\n\nUnfortunately, [your submi...",facebook
4,4,I have a warning still on my account that I go...,facebook


In [4]:
# Class balance check: number of posts per subreddit label
print(bothf.subreddit.value_counts())

Twitter     9207
facebook    9196
Name: subreddit, dtype: int64


# The counts above confirm that the mix of Twitter and facebook posts hasn't changed much and is still balanced

In [5]:
# Drop the leftover 'Unnamed: 0' column (presumably a stale index written into the CSV — verify).
# Re-assigning the result (rather than inplace=True) together with errors='ignore' keeps this
# cell re-runnable: executing it a second time is a no-op instead of raising KeyError.
bothf = bothf.drop(columns=['Unnamed: 0'], errors='ignore')

In [6]:
# Confirm which columns remain after the drop
bothf.head(n=5)

Unnamed: 0,body,subreddit
0,Oh because they removed all the features u men...,facebook
1,One of my favourites\n\n[https://i.imgur.com/W...,facebook
2,i've literally been waiting 4 months. And stil...,facebook
3,"Hi, u/AtroopAT8,\n\nUnfortunately, [your submi...",facebook
4,I have a warning still on my account that I go...,facebook


# The code below tokenizes each post, lemmatizes the tokens so that similar word forms are merged, and then rejoins the tokens into a single string

In [7]:
#Tokenizing and lemmatizing

def tok_lem(dataset, var):
    """Return a copy of ``dataset`` with the text column ``var`` tokenized and lemmatized.

    Each value of the column is lowercased, split with NLTK's ``word_tokenize``,
    every token is passed through ``WordNetLemmatizer.lemmatize`` (default
    arguments), and the tokens are re-joined with single spaces.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the text column. It is not modified; a copy is returned.
    var : str
        Name of the column to normalize. Every value must support ``.lower()``
        (i.e. be a string).

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataset`` in which column ``var`` holds the normalized text.
    """
    wnl = WordNetLemmatizer()

    def _normalize(text):
        # lowercase -> word tokens -> lemma per token -> space-joined string
        return " ".join([wnl.lemmatize(tok) for tok in word_tokenize(text.lower())])

    result = dataset.copy()
    result[var] = [_normalize(text) for text in result[var]]
    return result

In [8]:
# Replace the raw post text in 'body' with its lowercased, lemmatized form
bothf = tok_lem(dataset=bothf, var='body')

In [9]:
# Spot-check the normalized text on the first three rows
bothf.head(n=3)

Unnamed: 0,body,subreddit
0,oh because they removed all the feature u ment...,facebook
1,one of my favourite [ http : //i.imgur.com/wxb...,facebook
2,i 've literally been waiting 4 month . and sti...,facebook


In [10]:
# Features are the normalized post text; the target is the source subreddit label
X, y = bothf['body'], bothf['subreddit']

## Modelling exercise with Logistic Regression below

In [11]:
# Hold out 25% of the posts for testing, stratified on the label so both subsets keep
# the same facebook/Twitter mix; the seed makes the split repeatable.
# NOTE(review): the "2" suffix presumably distinguishes this split from an earlier
# CountVectorizer experiment that is not part of this notebook — confirm.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

In [12]:
# Set up a pipeline with tf-idf vectorizer and Logistic Regression
pipe_tvec_lr = Pipeline([
    ('tvec', TfidfVectorizer()),
    # Seed the classifier: the hyperparameter grid only searches solver='liblinear',
    # which shuffles the data internally, so an unseeded estimator can produce a
    # slightly different fitted (and pickled) model on every run.
    ('logreg', LogisticRegression(random_state=42)),
])

## Define what hyperparameters to test in the pipe

In [35]:
# Hyperparameter grid for the tf-idf + logistic regression pipeline:
#   * tf-idf vocabulary size: 8k, 11k, 15k or 19k features
#   * n-grams: unigrams + bigrams, or unigrams through trigrams
#   * stop words: NLTK's English list only (a single option, so it is fixed, not searched)
#   * logistic regression: L1 vs L2 penalty, three values of C, liblinear solver

stopwords = nltk.corpus.stopwords.words('english')
# Considered but left disabled: extending the stop words with the platform names
# ('facebook', 'Facebook', 'fb', 'twitter', 'Twitter').

pipe_tvec_lr_params = {
    # vectorizer settings
    'tvec__max_features': [8000, 11000, 15000, 19000],
    'tvec__stop_words': [stopwords],
    'tvec__ngram_range': [(1, 2), (1, 3)],
    # classifier settings
    'logreg__penalty': ['l1', 'l2'],
    'logreg__C': [0.05, 0.1, 0.15],
    'logreg__solver': ['liblinear'],
}

In [36]:
# Grid search over pipe_tvec_lr_params, scoring each combination with 5-fold cross-validation
gs_tvec_lr = GridSearchCV(
    estimator=pipe_tvec_lr,
    param_grid=pipe_tvec_lr_params,
    cv=5,
)

In [37]:
# Run the search on the training split only; the test split stays untouched for evaluation
gs_tvec_lr.fit(X=X_train2, y=y_train2)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('logreg', LogisticRegression())]),
             param_grid={'logreg__C': [0.05, 0.1, 0.15],
                         'logreg__penalty': ['l1', 'l2'],
                         'logreg__solver': ['liblinear'],
                         'tvec__max_features': [8000, 11000, 15000, 19000],
                         'tvec__ngram_range': [(1, 2), (1, 3)],
                         'tvec__stop_words': [['i', 'me', 'my', 'myself', 'we',
                                               'our', 'ours', 'ourselves',
                                               'you', "you're", "you've",
                                               "you'll", "you'd", 'your',
                                               'yours', 'yourself',
                                               'yourselves', 'he', 'him', 'his',
                                               'himself', 'she', "she's",

In [38]:
# Report the fitted search's score on the training split and on the held-out test split
print(
    f'Train score using tvec/logreg is {gs_tvec_lr.score(X_train2, y_train2)}',
    f'Test score using tvec/logreg is {gs_tvec_lr.score(X_test2, y_test2)}',
    sep='\n',
)

Train score using tvec/logreg is 0.8753803796551225
Test score using tvec/logreg is 0.8109106715931319


In [46]:
# Smoke test: predict the source for a few hand-written posts.
# NOTE(review): these strings go to the model as-is, whereas the training text was
# first passed through tok_lem (lowercased + lemmatized) — confirm the Streamlit app
# applies the same preprocessing, or that the difference is acceptable.
sr = pd.Series([
    'post',
    'Tweets are not working',
    'What is wrong with messenger?',
])
gs_tvec_lr.predict(sr)

array(['facebook', 'Twitter', 'facebook'], dtype=object)

In [48]:
# Single-post case: wrap the text in a one-element Series and take the first predicted label
sr = pd.Series(['tweet'])
gs_tvec_lr.predict(sr)[0]

'Twitter'

In [43]:
# Persist the fitted grid search to model.p so that it can be used for Streamlit.
# The context manager guarantees the file is flushed and closed, even if pickling
# raises, instead of leaving an open handle for the garbage collector to clean up.
# NOTE(review): this stores the whole GridSearchCV object; if the app only needs
# predict(), pickling gs_tvec_lr.best_estimator_ would presumably give a smaller file.
with open('model.p', 'wb') as model_file:
    pickle.dump(gs_tvec_lr, model_file)