#                Preprocessing 1 combined with Pipeline 1 and 2:
### The following code provides 2 outputs of CSV files.


In [2]:
#importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import string
import re
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import WordPunctTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from  sklearn.metrics  import accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from  sklearn.metrics  import f1_score
from sklearn.model_selection import GridSearchCV
from bs4 import BeautifulSoup
dev_df = pd.read_csv("C:/Users/LENOVO/Downloads/DSL2122_january_dataset/DSL2122_january_dataset/development.csv")
eval_df = pd.read_csv("C:/Users/LENOVO/Downloads/DSL2122_january_dataset/DSL2122_january_dataset/evaluation.csv")


FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/LENOVO/Downloads/DSL2122_january_dataset/DSL2122_january_dataset/development.csv'

##  Prep 1:

In [11]:
#WORK ON DUPLICATES => remove the duplicates
dev_df.drop_duplicates(subset = 'text', inplace = True)

In [12]:
#drop columns that we won't use
dev_df.drop(['ids', 'date','flag', 'user'], axis=1, inplace=True)
eval_df.drop(['ids', 'date','flag', 'user'], axis=1, inplace=True)

In [13]:
#creation of a dictionary of negations that splits abbrevations onto their full format
#creation of the cleaner function in order to preprocess only the text feature of both development and evaluation set
#the function itself handles the HTML decoding, the @, the URLs, the uppercase letters, the negations and tokenizes the tweets
#eliminating the punctuations

negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}

neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

def cleaner(tweet):
    
    soup = BeautifulSoup(tweet, 'html.parser')
    tweet = soup.get_text()
    tweet = re.sub('@[A-Za-z0-9]+',"",tweet)
    tweet = re.sub(r'https?://[^ ]+', "", tweet)
    tweet = re.sub(r'www.[^ ]+', "", tweet)
    tweet = re.sub("[^a-zA-Z]", " ", tweet)
    tweet_lower = tweet.lower()
    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], tweet_lower)
    
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    tweet_tokens = tokenizer.tokenize(neg_handled)


    theTweet = ' '
  

    for word in tweet_tokens:
        if (word not in string.punctuation): 
            theTweet=theTweet+ ' '+word
  
    return theTweet

In [14]:
#cleaner function applied to the text column of the development dataset
testing = dev_df.text
test_result = []
for t in testing:
    test_result.append(cleaner(t))




In [15]:
#cleaner function applied to the text column of the evaluation dataset
eval_testing = eval_df.text
eval_test_result = []
for t in eval_testing:
    eval_test_result.append(cleaner(t))

In [16]:
#appending the clean_text columns in both the development and evaluation dataframe
dev_df['clean_text'] = test_result
eval_df['clean_text'] = eval_test_result

## Pipeline 1 (SGDClassifier) on Prep 1:

In [17]:
#Splitting of the dataset in training and test set fixing the test size of 0.2
X_train, X_test, y_train, y_test = train_test_split(dev_df['clean_text'], dev_df['sentiment'], test_size=0.2, random_state=42)

In [39]:
# percentage of negative sentiments in the X_train dataset
"the test set has "+str(len(X_train[y_train==0])/len(X_train)*100)+ "% of negative tweets"

'the test set has 42.0446650680173% of negative tweets'

In [40]:
#percentage of positive sentiments in the X_train dataset
"the test set has "+ str(len(X_train[y_train==1])/len(X_train)*100)+ "% of positive tweets"

'the test set has 57.95533493198269% of positive tweets'

In [18]:
#attempt of the algorithm SGDClassifier without tuning the hyperparameters

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

text_clf.fit(X_train, y_train)


Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier())])

In [19]:
predicted = text_clf.predict(eval_df['clean_text'])

In [20]:
df = pd.DataFrame(data= predicted, columns=['Predicted'])
df.insert(0,'Id',df.index)
df
df.to_csv('C:/Users/LENOVO/Downloads/OutputSGDprep1notuning.csv',index=False)

In [21]:
#application of the GridSearchCV for finding the best combination of hyperparameters for all the three algorithms
#CountVectorizer(max_df, max_features, ngram_range)
#TfidfTransform (use_idf)
#SGDClassifier (max_iter, alpha, penalty)

parameters = {
    "vect__max_df": (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    "vect__ngram_range": ((1, 1), (1, 2), (1,3)),
    'tfidf__use_idf': (True, False),
    "clf__loss": ('hinge', 'log'),
    "clf__alpha": (0.00001, 0.000001),
    'clf__max_iter': (1000, 5000, 10000),
}

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

grid_search = GridSearchCV(text_clf, parameters, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)
print("Best score: %0.3f" % grid_search.best_score_)

Fitting 5 folds for each of 864 candidates, totalling 4320 fits
Best score: 0.802


In [22]:
#Printing the best combination of hyperparameters
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

	clf__alpha: 1e-05
	clf__loss: 'hinge'
	clf__max_iter: 1000
	tfidf__use_idf: True
	vect__max_df: 1.0
	vect__max_features: None
	vect__ngram_range: (1, 2)


In [53]:
#attempt of the SGDClassifier after the GridSearch
text_clf = Pipeline([
    ('vect', CountVectorizer(max_df= 1.0, max_features=None, ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SGDClassifier(max_iter=1000, loss='hinge', alpha=1e-05)),
])

text_clf.fit(X_train, y_train)


Pipeline(steps=[('vect', CountVectorizer(ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(alpha=1e-05, random_state=42))])

In [54]:
predicted = text_clf.predict(eval_df['clean_text'])

In [55]:
df = pd.DataFrame(data= predicted, columns=['Predicted'])
df.insert(0,'Id',df.index)
df
df.to_csv('C:/Users/LENOVO/Downloads/OutputSGDprep1.csv',index=False)

## Pipeline 2 (MultinomialNB) on Prep 2: 

In [21]:
#Splitting of the dataset in training and test set fixing the test size of 0.01
X_train, X_test, y_train, y_test = train_test_split(dev_df['clean_text'], dev_df['sentiment'], test_size=0.01, random_state=42)

In [22]:
#attempt of the MultinomialNB without hyperparameters tuning
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [23]:
predicted = text_clf.predict(eval_df['clean_text'])

In [24]:
df = pd.DataFrame(data= predicted, columns=['Predicted'])
df.insert(0,'Id',df.index)
df
df.to_csv('C:/Users/LENOVO/Downloads/OutputNBprep1notuning.csv',index=False)

In [11]:
#application of the GridSearchCV for finding the best combination of hyperparameters for all the three algorithms
#CountVectorizer(max_df, ngram_range)
#TfidfTransform
#SGDClassifier (alpha)

parameters = {
    "vect__max_df": (0.5, 0.75, 1.0),
    "vect__ngram_range": ((1, 1), (1, 2), (1,3)),
    'clf__alpha': np.arange(0, 1, 0.05),
}

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

grid_search = GridSearchCV(text_clf, parameters, n_jobs=-1, cv=3)
grid_search.fit(dev_df['clean_text'], dev_df['sentiment'])
print("Best score: %0.3f" % grid_search.best_score_)

Best score: 0.783


In [12]:
#Printing the best combination of hyperparameters
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

	clf__alpha: 0.25
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)


In [59]:
#attempt of the MultinomialNB after the GridSearch
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range= (1, 2), max_df=0.5)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=0.25)),
])
text_clf.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer(max_df=0.5, ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB(alpha=0.25))])

In [60]:
predicted = text_clf.predict(eval_df['clean_text'])

In [61]:
df = pd.DataFrame(data= predicted, columns=['Predicted'])
df.insert(0,'Id',df.index)
df
df.to_csv('C:/Users/LENOVO/Downloads/OutputNBprep1.csv',index=False)