## Libraries

In [64]:
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from nltk.stem.porter import PorterStemmer
import contractions
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import random
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
import pickle
import warnings
warnings.filterwarnings('ignore')

## Load train dataset

In [6]:
df=pd.read_csv('train.csv')

## Data cleaning

In [7]:
# This class is used for data cleaning, it has 4 methods and they are used for
# Contractions, stop words, lemmatizer, porterStemmer
class grammer(object):
    def __init__(self,txt):
        self.txt=txt
    
    def contraction(self):
        t = self.txt
        expanded_words = []    
        for word in t.split():
            expanded_words.append(contractions.fix(word))   
        expanded_text = ' '.join(expanded_words)    

        return expanded_text

    def stop_word(self):
        example_sent = self.txt
        stop_words = set(stopwords.words('english'))
        word_tokens = word_tokenize(example_sent)
        filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
        filtered_sentence = []
        for w in word_tokens:
            if w not in stop_words:
                filtered_sentence.append(w)
        expanded_text = ' '.join(filtered_sentence)    
        
        return expanded_text

    def lemmatize(self):
        wnl = WordNetLemmatizer()
        string = self.txt
        list2 = nltk.word_tokenize(string)
        lemmatized_string = ' '.join([wnl.lemmatize(words) for words in list2])

        return lemmatized_string  
    def stem(self):
        ps = PorterStemmer()
        sentence = self.txt
        words = word_tokenize(sentence)
        g=[]  
        for w in words:
            g.append(ps.stem(w))
        final=' '.join(s for s in g)
        
        return final

In [8]:
# This class is used as a pipeline and is inhereting from BaseEstimator and TransformerMixin
# The reason that we need to build this class as a pipeline is that we can also clean our test data with this pipeline
class DataFrameImputer(BaseEstimator,TransformerMixin):

    def __init__(self,variable1):
        self.variable1=variable1
        
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        df= X.copy()
        # removing hashtags
        df[self.variable1]=df[self.variable1].apply(lambda x: x.replace('#',''))
        # removing any @ or links
        df[self.variable1]=df[self.variable1].apply(lambda x: ' '.join(re.sub("(@[A-Za-z0-9]+)|(\w+:\/\/\S+)"," ",str(x)).split()))
        # removing digits
        df[self.variable1]=df[self.variable1].apply(lambda x: re.sub(r'[^a-zA-Z0-9]', ' ', x))
        df[self.variable1]=df[self.variable1].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))
        # lower case everything
        df[self.variable1]=df[self.variable1].apply(lambda x: " ".join(x.split()))
        df[self.variable1]=df[self.variable1].apply(lambda x: x.lower())
        # apply the grammer class that we defined above
        df[self.variable1]=df[self.variable1].apply(lambda x: grammer(x).contraction())
        df[self.variable1]=df[self.variable1].apply(lambda x: grammer(x).stop_word())
        df[self.variable1]=df[self.variable1].apply(lambda x: grammer(x).lemmatize())
        df[self.variable1]=df[self.variable1].apply(lambda x: grammer(x).stem())

        return df

## Feature engineering

In [9]:
# making the pipeline from the class that we defined above
raw_data=Pipeline(steps=[('df',DataFrameImputer('tweet'))])
# fit and transform our data with the pipeline
df=raw_data.fit_transform(df)
# converting the dataframe into a dictionary
final=df.to_dict(orient='records')

In [10]:
# since our data set had two columns and they were the text and the number
# 0 was positive and 1 was negative
# we make a class and we name the text txt and number num and we also defined sent which is either positive or negative
class convert(object):
    def __init__(self,txt,num):
        self.txt=txt
        self.num=num
        self.sent=self.sentiment()
    def sentiment(self):
        if self.num==0:
            return 'POSITIVE'
        else:
            return 'NEGATIVE'

In [11]:
# in this function we apply the convert class that we defined above and we also make sure 
# to have even amount of positve and negative data, so our model won't be bias
# and also the f1_score will be close for both positive and negative
def even(dic):
    data=[]
    negative=[]
    positive=[]
    for i in dic:
        data.append(convert(i['tweet'],i['label']))
    for i in range(len(data)):
        if data[i].sent=='NEGATIVE':
            negative.append(data[i])
        else:
            positive.append(data[i])

    positive=positive[:len(negative)]
    full=negative+positive
    return full

In [12]:
# applying the function
data=even(final)

# shuffling the data
random.shuffle(data)

## train, test, split

In [19]:
X_train,X_test,y_train,y_test=train_test_split([x.txt for x in data],[x.sent for x in data]
                                               ,test_size=.20
                                               ,random_state=1)

## model building

we will use TfidfVectorizer() as our text vectorizer and RandomForestClassifier() as our classifier

#### Random Forest

In [37]:
# making a pipeline for our models
pipe_rf=Pipeline(steps=[('vec',TfidfVectorizer()),
                     ('rf',RandomForestClassifier())])

# fit
pipe_rf.fit(X_train,y_train)
# predict
y_pred_rf=pipe_rf.predict(X_test)
# f1_score
f1_score(y_test,y_pred_rf,average=None,labels=['POSITIVE','NEGATIVE'])
# we can see that our model is not bias towards any of our variables 
# and it can predict positive and negative almost the same

array([0.86887115, 0.87459106])

#### Cross validation for Random Forest

In [38]:
RF=cross_val_score(pipe_rf,X_train,y_train,cv=10,scoring='accuracy').mean()
RF

0.8305122858343317

#### Parameter tuning for Random Forest

In [35]:
param_rf = {
"rf__bootstrap": [True,False],
"rf__max_depth": list(range(80,220,20)), 
"rf__max_features": ["auto","sqrt"],
"rf__min_samples_split": [2,5,10],
"rf__min_samples_leaf": [1,2,4,6,8,10],
"rf__n_estimators":  [int(x) for x in np.linspace(start = 200, stop = 2000, num =10)]
}


In [36]:
rand_rf=RandomizedSearchCV(pipe_rf,param_rf,cv=10,n_iter=10,scoring='accuracy')
rand_rf.fit(X_train,y_train)
rand_rf.best_score_

0.8338556822956381

In [57]:
model=rand_rf.best_estimator_

#### cross validation of the tuned model

In [60]:
final_cross=cross_val_score(model,X_train,y_train,cv=10,scoring='accuracy').mean()
final_cross

0.8338564603725432

#### evaluate the model on the test set

In [62]:
y_pred=model.predict(X_test)
accuracy_score(y_test,y_pred)

0.8706800445930881

In [63]:
with open('SENTIMENT.pkl','wb') as f:
    pickle.dump(model,f)

# load
with open('SENTIMENT.pkl', 'rb') as f:
    model = pickle.load(f)

### Predicting the test dataset

In [84]:
test=pd.read_csv('test.csv')
test1=test.copy()

# We use the raw_data pipeline that we created earlier to fit_transform the test data 
test=raw_data.fit_transform(test)


In [80]:
# Predicting each tweet 
test['predictions']=test['tweet'].apply(lambda x: model.predict([x]))

In [81]:
# We drop the the modified tweet where we used raw_data pipeline on
test.drop(['tweet'],1,inplace=True)

# We merge the test and test1 so we can have the original tweet text that are not modified
final_df=pd.merge(test,test1,on=['id'],how='inner')

# Converting ndarray to strings
final_df['predictions']=final_df['predictions'].apply(lambda x: ','.join(str(a) for a in x))

In [88]:
final_df.head()

Unnamed: 0,id,predictions,tweet
0,31963,POSITIVE,#studiolife #aislife #requires #passion #dedic...
1,31964,NEGATIVE,@user #white #supremacists want everyone to s...
2,31965,POSITIVE,safe ways to heal your #acne!! #altwaystohe...
3,31966,POSITIVE,is the hp and the cursed child book up for res...
4,31967,POSITIVE,"3rd #bihday to my amazing, hilarious #nephew..."
