# Solution 3 - Pipeline 1 and Pipeline 2 with Prep 3


In [1]:
#importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import WordPunctTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from  sklearn.metrics  import accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from  sklearn.metrics  import f1_score
from sklearn.model_selection import GridSearchCV
from bs4 import BeautifulSoup
dev_df = pd.read_csv("C:/Users/LENOVO/Downloads/DSL2122_january_dataset/DSL2122_january_dataset/development.csv")
eval_df = pd.read_csv("C:/Users/LENOVO/Downloads/DSL2122_january_dataset/DSL2122_january_dataset/evaluation.csv")


## Prep 3 - using SnowBall Stemmer

In [2]:
#WORK ON DUPLICATES => remove the duplicates
dev_df.drop_duplicates(subset = 'text', inplace = True)

In [3]:
#dropping columns that we won't use
dev_df.drop(['ids', 'date','flag', 'user'], axis=1, inplace=True)
eval_df.drop(['ids', 'date','flag', 'user'], axis=1, inplace=True)

In [4]:
#creation of a dictionary of negations that splits abbrevations onto their full format.
#creation of the cleaner function in order to preprocess only the text feature of both development and evaluation set
#the function itself handles the HTML decoding, the @, the URLs, the uppercase letters, the negations and tokenizes the tweets
#eliminating the punctuations

negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}

neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

def cleaner(tweet):
    stemmer = SnowballStemmer(language='english')
    
    soup = BeautifulSoup(tweet, 'html.parser')
    tweet = soup.get_text()
    tweet = re.sub('@[A-Za-z0-9]+',"",tweet)
    tweet = re.sub(r'https?://[^ ]+', "", tweet)
    tweet = re.sub(r'www.[^ ]+', "", tweet)
    tweet = re.sub("[^a-zA-Z]", " ", tweet)
    tweet_lower = tweet.lower()
    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], tweet_lower)
    
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    tweet_tokens = tokenizer.tokenize(neg_handled)


 
    theTweet=''

    for word in tweet_tokens:
        if (word not in string.punctuation):
            stem_word = stemmer.stem(word)
            theTweet= theTweet+ " " + stem_word
    
    return theTweet

In [5]:
#cleaner function applied to the text column of the development dataset
testing = dev_df.text
test_result = []
for t in testing:
    test_result.append(cleaner(t))



In [6]:
#cleaner function applied to the text column of the evaluation dataset
eval_testing = eval_df.text
eval_test_result = []
for t in eval_testing:
    eval_test_result.append(cleaner(t))

In [7]:
#appending the clean_text columns in both the development and evaluation dataframe
dev_df['clean_text'] = test_result
eval_df['clean_text'] = eval_test_result

In [8]:
dev_df.head()

Unnamed: 0,sentiment,text,clean_text
0,1,"@MissBianca76 Yes, talking helps a lot.. going...",yes talk help a lot go through it there s no ...
1,1,SUNSHINE. livingg itttt. imma lie on the grass...,sunshin livingg ittt imma lie on the grass li...
2,1,@PleaseBeMine Something for your iphone,someth for your iphon
3,0,@GabrielSaporta couldn't get in to the after p...,couldn t get in to the after parti
4,0,@bradiewebbstack awww is andy being mean again...,a andi be mean again now i want macca


## Pipeline 1 (SGDClassifier algorithm) on Prep 3

In [9]:
#Splitting of the dataset in training and test set fixing the test size of 0.2
X_train, X_test, y_train, y_test = train_test_split(dev_df['clean_text'], dev_df['sentiment'], test_size=0.2, random_state=42)

In [10]:
#attempt of the algorithm SGDClassifier without tuning the hyperparameters
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(eval_df['clean_text'])

In [11]:
df = pd.DataFrame(data= predicted, columns=['Predicted'])
df.insert(0,'Id',df.index)
df
df.to_csv('C:/Users/LENOVO/Downloads/OutputSGDprep3notuning.csv',index=False)

In [13]:
#application of the GridSearchCV for finding the best combination of hyperparameters for all the three algorithms
#CountVectorizer(max_df, max_features, ngram_range)
#TfidfTransform (use_idf)
#SGDClassifier (max_iter, alpha, penalty)

parameters = {
    "vect__max_df": (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    "vect__ngram_range": ((1, 1), (1, 2), (1,3)),
    'tfidf__use_idf': (True, False),
    "clf__loss": ('hinge', 'log'),
    "clf__alpha": (0.00001, 0.000001),
    'clf__max_iter': (1000, 5000, 10000),
}

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

grid_search = GridSearchCV(text_clf, parameters, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)
print("Best score: %0.3f" % grid_search.best_score_)

Fitting 5 folds for each of 864 candidates, totalling 4320 fits
Best score: 0.802


In [14]:
#Printing the best combination of hyperparameters
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

	clf__alpha: 1e-05
	clf__loss: 'hinge'
	clf__max_iter: 5000
	tfidf__use_idf: True
	vect__max_df: 0.75
	vect__max_features: None
	vect__ngram_range: (1, 3)


In [39]:
#attempt of the SGDClassifier after the GridSearch
text_clf = Pipeline([
    ('vect', CountVectorizer(max_df=0.75, ngram_range=(1, 3), max_features=None)),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SGDClassifier(alpha=1e-05, max_iter=5000, loss='hinge')),
])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(eval_df['clean_text'])

In [40]:
df = pd.DataFrame(data= predicted, columns=['Predicted'])
df.insert(0,'Id',df.index)
df
df.to_csv('C:/Users/LENOVO/Downloads/OutputSGDprep3.csv',index=False)

## Pipeline 2 (MultinomialNB algorithm) on Prep 3

In [12]:
#Splitting of the dataset in training and test set fixing the test size of 0.01
X_train, X_test, y_train, y_test = train_test_split(dev_df['clean_text'], dev_df['sentiment'], test_size=0.01, random_state=42)

In [13]:
#attempt of the MultinomialNB without hyperparameters tuning
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [14]:
predicted = text_clf.predict(eval_df['clean_text'])

In [15]:
df = pd.DataFrame(data= predicted, columns=['Predicted'])
df.insert(0,'Id',df.index)
df
df.to_csv('C:/Users/LENOVO/Downloads/OutputNBprep3notuning.csv',index=False)

In [28]:
#application of the GridSearchCV for finding the best combination of hyperparameters for all the three algorithms
#CountVectorizer(max_df, ngram_range)
#TfidfTransform
#SGDClassifier (alpha)

parameters = {
    "vect__max_df": (0.5, 0.75, 1.0),
    "vect__ngram_range": ((1, 1), (1, 2), (1,3)),
    'clf__alpha': np.arange(0, 1, 0.05),
}

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

grid_search = GridSearchCV(text_clf, parameters, n_jobs=-1, cv=3)
grid_search.fit(dev_df['clean_text'], dev_df['sentiment'])
print("Best score: %0.3f" % grid_search.best_score_)

Best score: 0.782


In [29]:
#Printing the best combination of hyperparameters
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

	clf__alpha: 0.25
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)


In [42]:
#attempt of the MultinomialNB after the GridSearch
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,2), max_df=0.5)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=0.25)),
])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(eval_df['clean_text'])

In [43]:
df = pd.DataFrame(data= predicted, columns=['Predicted'])
df.insert(0,'Id',df.index)
df
df.to_csv('C:/Users/LENOVO/Downloads/OutputNBprep3.csv',index=False)