# Solution 5 - Pipeline 1 and 2 on Prep 5


In [1]:
#importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import string
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from  sklearn.metrics  import accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from  sklearn.metrics  import f1_score
from sklearn.model_selection import GridSearchCV
from bs4 import BeautifulSoup
dev_df = pd.read_csv("C:/Users/LENOVO/Downloads/DSL2122_january_dataset/DSL2122_january_dataset/development.csv")
eval_df = pd.read_csv("C:/Users/LENOVO/Downloads/DSL2122_january_dataset/DSL2122_january_dataset/evaluation.csv")


## Preprocessing 5 -  with PorterStemmer including user and date attributes.

In [2]:
#WORK ON DUPLICATES => remove the duplicates
dev_df.drop_duplicates(subset = 'text', inplace = True)

In [3]:
#dropping columns that we won't use
dev_df.drop(['ids','flag'], axis=1, inplace=True)
eval_df.drop(['ids', 'flag'], axis=1, inplace=True)

In [4]:
#creation of a dictionary of negations that splits abbrevations onto their full format.
#creation of the cleaner function in order to preprocess only the text feature of both development and evaluation set
#the function itself handles the HTML decoding, the @, the URLs, the uppercase letters, the negations and tokenizes the tweets
#eliminating the punctuations

negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}

neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

def cleaner(tweet):
    stemmer = PorterStemmer()
    
    soup = BeautifulSoup(tweet, 'html.parser')
    tweet = soup.get_text()
    tweet = re.sub('@[A-Za-z0-9]+',"",tweet)
    tweet = re.sub(r'https?://[^ ]+', "", tweet)
    tweet = re.sub(r'www.[^ ]+', "", tweet)
    tweet = re.sub("[^a-zA-Z]", " ", tweet)
    tweet_lower = tweet.lower()
    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], tweet_lower)
    
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    tweet_tokens = tokenizer.tokenize(neg_handled)


 
    theTweet=''

    for word in tweet_tokens:
        if (word not in string.punctuation):
            stem_word = stemmer.stem(word)
            theTweet= theTweet+ " " + stem_word
    
    return theTweet



In [5]:
#the attribute 'date' in both development set and evaluation set has type <object>
#the pandas library transform the date object in datetime64[ns] which allows us to extrapolate the part of the datetime
#that we require with the pre-defined pandas functions

dev_df['date']=pd.to_datetime(dev_df['date'])
eval_df['date']=pd.to_datetime(eval_df['date'])



In [6]:
#extraction only of the hour from the datetime which is in format (hour:min:sec)
dev_df['date_hour']=dev_df['date'].dt.hour
eval_df['date_hour']=eval_df['date'].dt.hour

In [7]:
#extraction of the date in the format (year-month-day)
dev_df['date']=dev_df['date'].dt.date
eval_df['date']=eval_df['date'].dt.date

In [8]:
#cleaner function applied for cleaning the text column of the dev dataset
testing = dev_df.text
test_result = []
for t in testing:
    test_result.append(cleaner(t))



In [9]:
#cleaner function applied for cleaning the text column of the eval dataset
eval_testing = eval_df.text
eval_test_result = []
for t in eval_testing:
    eval_test_result.append(cleaner(t))

In [10]:
#appending the clean_text columns in both the development and evaluation dataframe
dev_df['clean_text'] = test_result
eval_df['clean_text'] = eval_test_result

In [11]:
dev_df.drop(['text'], axis=1, inplace=True)
eval_df.drop(['text'], axis=1, inplace=True)

In [12]:
#creation of a new column merging the attributes: text, date, hour and user all in the string format
dev_df['total'] = dev_df['date'].astype(str) + " " + dev_df['date_hour'].astype(str) + " " + dev_df['clean_text'] + " " + dev_df['user']
eval_df['total'] = eval_df['date'].astype(str) + " " + eval_df['date_hour'].astype(str) + " " + eval_df['clean_text'] + " " + eval_df['user']

In [13]:
dev_df.head()

Unnamed: 0,sentiment,date,user,date_hour,clean_text,total
0,1,2009-05-18,Killandra,1,ye talk help a lot go through it there s no j...,2009-05-18 1 ye talk help a lot go through it...
1,1,2009-05-31,IMlisacowan,6,sunshin livingg ittt imma lie on the grass li...,2009-05-31 6 sunshin livingg ittt imma lie on...
2,1,2009-06-01,yaseminx3,11,someth for your iphon,2009-06-01 11 someth for your iphon yaseminx3
3,0,2009-05-17,no_surprises,2,couldn t get in to the after parti,2009-05-17 2 couldn t get in to the after par...
4,0,2009-06-02,Rhi_ShortStack,0,a andi be mean again now i want macca,2009-06-02 0 a andi be mean again now i want ...


In [14]:
eval_df.head()

Unnamed: 0,date,user,date_hour,clean_text,total
0,2009-06-01,urbanperspectiv,21,i m pretti much the same in either world,2009-06-01 21 i m pretti much the same in eit...
1,2009-05-17,therealsecret,11,same here have a gr week ahead,2009-05-17 11 same here have a gr week ahead ...
2,2009-04-19,bitchville,23,that s just nightmar all over,2009-04-19 23 that s just nightmar all over b...
3,2009-06-16,epi_longo,0,ch c ph i i thi i h c th t qu,2009-06-16 0 ch c ph i i thi i h c th t qu ep...
4,2009-05-30,Curiosafmmb,12,sweeti awe ok sweeti ttyl hug,2009-05-30 12 sweeti awe ok sweeti ttyl hug C...


## Pipeline 1 (SGDClassifier algorithm) on Prep 5

In [15]:
#Splitting of the dataset in training and test set fixing the test size of 0.2
X_train, X_test, y_train, y_test = train_test_split(dev_df['total'], dev_df['sentiment'], test_size=0.2)

In [16]:
#attempt of the algorithm SGDClassifier without tuning the hyperparameters
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

text_clf.fit(X_train, y_train)
predicted = text_clf.predict(eval_df['total'])

In [17]:
df = pd.DataFrame(data= predicted, columns=['Predicted'])
df.insert(0,'Id',df.index)
df
df.to_csv('C:/Users/LENOVO/Downloads/OutputSGDprep5notuning.csv',index=False)

In [16]:
#application of the GridSearchCV for finding the best combination of hyperparameters for all the three algorithms
#CountVectorizer(max_df, max_features, ngram_range)
#TfidfTransform (use_idf)
#SGDClassifier (max_iter, alpha, penalty)

parameters = {
    "vect__max_df": (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    "vect__ngram_range": ((1, 1), (1, 2), (1,3)),
    'tfidf__use_idf': (True, False),
    "clf__loss": ('hinge', 'log'),
    "clf__alpha": (0.00001, 0.000001),
    'clf__max_iter': (1000, 5000, 10000),
}

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

grid_search = GridSearchCV(text_clf, parameters, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)
print("Best score: %0.3f" % grid_search.best_score_)

Fitting 3 folds for each of 864 candidates, totalling 2592 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 17.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 40.6min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 74.4min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 115.2min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 169.2min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 231.4min
[Parallel(n_jobs=-1)]: Done 2592 out of 2592 | elapsed: 245.4min finished


Best score: 0.842


In [17]:
#Printing the best combination of hyperparameters
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

	clf__alpha: 1e-06
	clf__loss: 'hinge'
	clf__max_iter: 5000
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__max_features: None
	vect__ngram_range: (1, 3)


In [21]:
#attempt of the SGDClassifier after the GridSearch
text_clf = Pipeline([
    ('vect', CountVectorizer(max_df=0.5, ngram_range=(1, 3), max_features=None)),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SGDClassifier(alpha=1e-06, max_iter=5000, loss='hinge')),
])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(eval_df['total'])

In [22]:
df = pd.DataFrame(data= predicted, columns=['Predicted'])
df.insert(0,'Id',df.index)
df
df.to_csv('C:/Users/LENOVO/Downloads/OutputSGDprep5.csv',index=False)

## Pipeline 2 (MultinomialNB algorithm) on Prep 5

In [18]:
#Splitting of the dataset in training and test set fixing the test size of 0.01
X_train, X_test, y_train, y_test = train_test_split(dev_df['total'], dev_df['sentiment'], test_size=0.01, random_state=42)

In [19]:
#attempt of the MultinomialNB without hyperparameters tuning
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [20]:
predicted = text_clf.predict(eval_df['total'])

In [21]:
df = pd.DataFrame(data= predicted, columns=['Predicted'])
df.insert(0,'Id',df.index)
df
df.to_csv('C:/Users/LENOVO/Downloads/OutputNBprep5notuning.csv',index=False)

In [27]:
#application of the GridSearchCV for finding the best combination of hyperparameters for all the three algorithms
#CountVectorizer(max_df, ngram_range)
#TfidfTransform
#SGDClassifier (alpha)

parameters = {
    "vect__max_df": (0.5, 0.75, 1.0),
    "vect__ngram_range": ((1, 1), (1, 2), (1,3)),
    'clf__alpha': np.arange(0, 1, 0.05),
}

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

grid_search = GridSearchCV(text_clf, parameters, n_jobs=-1, cv=3)
grid_search.fit(dev_df['total'], dev_df['sentiment'])
print("Best score: %0.3f" % grid_search.best_score_)

Best score: 0.830


In [28]:
#Printing the best combination of hyperparameters
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

	clf__alpha: 0.15000000000000002
	vect__max_df: 0.5
	vect__ngram_range: (1, 3)


In [29]:
#attempt of the MultinomialNB after the GridSearch
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,3), max_df=0.5)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=0.15000000000000002)),
])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(eval_df['total'])

In [30]:
df = pd.DataFrame(data= predicted, columns=['Predicted'])
df.insert(0,'Id',df.index)
df
df.to_csv('C:/Users/LENOVO/Downloads/OutputNBprep5.csv',index=False)