In [None]:
#Required Libraries
import numpy as np 
import pandas as pd  
import re
import csv
import nltk
import spacy
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 

# Style
import matplotlib.style as style 
sns.set(font_scale=2)
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names, so prefer the new name whenever
# the installed Matplotlib offers it and fall back to the legacy name otherwise.
for _style_name in ('pastel', 'poster'):
    _renamed = f'seaborn-v0_8-{_style_name}'
    style.use(_renamed if _renamed in style.available else f'seaborn-{_style_name}')
from PIL import Image
from wordcloud import WordCloud

# Preprocessing
import en_core_web_sm
from collections import Counter
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords, wordnet  
from sklearn.feature_extraction.text import CountVectorizer   
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Building classification models
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Model evaluation
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score

In [None]:
# Load the competition data: labelled training tweets and the unlabelled test tweets.
df_train = pd.read_csv('../input/edsa-sentiment-classification/train.csv')
df_test = pd.read_csv('../input/edsa-sentiment-classification/test.csv')


In [None]:
# Dimensions of the test and train sets (rows, columns).
print(df_test.shape)
print(df_train.shape)

# Preview the first rows of each frame.
display(df_test.head(), df_train.head())

# Share of training tweets whose text duplicates another tweet, in percent.
train_messages = df_train['message']
percent_duplicates = round((1-(train_messages.nunique()/len(train_messages)))*100,2)
print('Duplicated tweets in train data:')
print(percent_duplicates,'%')

In [None]:
def modifyDf(df=None):
    """Return a copy of a tweet frame with numeric sentiment codes replaced by names.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame containing a ``sentiment`` column. When omitted, the
        notebook-level ``df_train`` is used, so existing ``modifyDf()``
        calls behave as before.

    Returns
    -------
    pandas.DataFrame
        A copy of the input whose ``sentiment`` column holds 'Pro' (1),
        'Neutral' (0), 'Anti' (-1) or 'News' (any other value). The input
        frame itself is not modified.
    """
    source = df_train if df is None else df
    labelled = source.copy()

    # Any code other than 1 / 0 / -1 falls through to 'News'.
    names = {1: 'Pro', 0: 'Neutral', -1: 'Anti'}
    labelled['sentiment'] = [names.get(code, 'News') for code in labelled['sentiment']]

    return labelled

# Working copy of the training data with readable sentiment names; preview it.
df_train_copy = modifyDf()
df_train_copy.head(5)

In [None]:
def hashtag_extract(tweet):
    """Return the 15 most frequent hashtags found in a collection of tweets.

    Parameters
    ----------
    tweet : iterable of str
        Tweet texts to scan, e.g. a pandas Series of messages.

    Returns
    -------
    pandas.DataFrame
        Columns ``hashtag`` (the text following '#') and ``count``, limited
        to the 15 rows with the largest counts. The frame is empty when no
        hashtag is found.
    """
    # Flatten while scanning; sum(list_of_lists, []) copies the growing list
    # on every addition and is quadratic in the number of tweets.
    hashtags = [tag for text in tweet for tag in re.findall(r"#(\w+)", text)]
    frequency = Counter(hashtags)

    hashtag_df = pd.DataFrame({'hashtag': list(frequency.keys()),
                               'count': list(frequency.values())})
    # With no hashtags the 'count' column is created with object dtype, which
    # nlargest rejects; pin it to an integer dtype so empty input works too.
    hashtag_df = hashtag_df.astype({'count': 'int64'})
    hashtag_df = hashtag_df.nlargest(15, columns="count")

    return hashtag_df

# Top hashtags per sentiment class, computed from that class's messages only.
pro, anti, neutral, news = (
    hashtag_extract(df_train_copy.loc[df_train_copy['sentiment'] == label, 'message'])
    for label in ('Pro', 'Anti', 'Neutral', 'News')
)

pro.head(5)

In [None]:
def CleanTweets(tweet):
    """Normalise a raw tweet into lowercase text with Twitter noise removed.

    Steps, in order: lowercase; remove @mentions, URLs, #hashtags and digits;
    replace the listed punctuation and the Unicode replacement character
    (U+FFFD) with a space; collapse runs of whitespace; strip both ends.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet.
    """
    tweet = tweet.lower()
    # Raw string: '\w' in a plain literal is an invalid escape sequence.
    tweet = re.sub(r'@\w*', '', tweet)
    # Stop at the first whitespace. A greedy '.*\/' would delete everything up
    # to the last '/' in the tweet and would miss URLs that have no path.
    tweet = re.sub(r'https?://\S+', '', tweet)
    tweet = re.sub(r'#\w*', '', tweet)
    tweet = re.sub(r'\d+', '', tweet)
    tweet = re.sub(r"[,.;':@#?!\&/$]+\ *", ' ', tweet)
    # Remove the actual replacement character; the regex "U+FFFD " only
    # matched a literal run of 'U' followed by 'FFFD '.
    tweet = tweet.replace('\ufffd', ' ')
    tweet = re.sub(r'\s\s+', ' ', tweet)

    # Strip both ends: punctuation replaced above can leave a trailing space.
    return tweet.strip()

# Replace each message in the working copy with its cleaned text, then preview.
cleaned_messages = df_train_copy['message'].apply(CleanTweets)
df_train_copy['message'] = cleaned_messages

df_train_copy.head(5)
    

In [None]:
def get_wordnet_pos(tag):
    """Map a POS tag string to a WordNet part-of-speech constant.

    The mapping is decided by the tag's leading letter: 'J' -> ADJ,
    'V' -> VERB, 'N' -> NOUN, 'R' -> ADV. Any other tag falls back to NOUN.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def lemma(df):
    """Add length, token, POS-tag and lemma columns to a tweet frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``message`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with these columns added:
        ``length`` (characters per message), ``tokenized`` (word tokens),
        ``pos_tags`` ((token, tag) pairs from ``nltk.tag.pos_tag``) and
        ``lemmatized`` (one WordNet lemma per token).
    """
    df['length'] = df['message'].str.len()
    df['tokenized'] = df['message'].apply(word_tokenize)
    df['pos_tags'] = df['tokenized'].apply(nltk.tag.pos_tag)

    # Previously the lemmatizer was created but never applied. Lemmatize each
    # token using the WordNet POS derived from its tag.
    wnl = WordNetLemmatizer()
    df['lemmatized'] = df['pos_tags'].apply(
        lambda tagged: [wnl.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged]
    )
    return df

# Enrich the working copy with the columns added by lemma(), then preview it.
df_train_copy = lemma(df_train_copy)
df_train_copy.head(5)

In [None]:
# Features are the raw tweet texts; the target is the numeric sentiment label.
X, y = df_train['message'], df_train['sentiment']

# Hold out 20% of the training data for validation (fixed seed for a repeatable split).
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Random Forest Classifier: TF-IDF features fed into a shallow 100-tree forest.
# random_state is fixed because the forest is stochastic; without a seed the
# validation report changes on every run.
rf = Pipeline([('tfidf', TfidfVectorizer()),
               ('clf', RandomForestClassifier(max_depth=5, n_estimators=100, random_state=42))])

# Train the random forest and predict the validation split
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_valid)

# Generate a classification report for the random forest model
print(metrics.classification_report(y_valid, y_pred_rf))

In [None]:
# Naive Bayes: TF-IDF features fed into a multinomial NB classifier.
nb = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Fit on the training split, then predict the held-out validation split.
y_pred_nb = nb.fit(X_train, y_train).predict(X_valid)

# Per-class precision / recall / F1 for the Naive Bayes model.
print(classification_report(y_valid, y_pred_nb))

In [None]:
# K-nearest neighbours: TF-IDF features, 5 neighbours, Minkowski metric with p=2.
knn = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)),
])

# Fit on the training split, then predict the held-out validation split.
y_pred_knn = knn.fit(X_train, y_train).predict(X_valid)

# Per-class precision / recall / F1 for the k-NN model.
print(classification_report(y_valid, y_pred_knn))

In [None]:
# Logistic regression: TF-IDF features, class-balanced weights, up to 1000 iterations.
lr = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(C=1, class_weight='balanced', max_iter=1000)),
])

# Fit the logistic regression on the training split and predict the validation split.
y_pred_lr = lr.fit(X_train, y_train).predict(X_valid)

# Per-class precision / recall / F1 for the logistic regression model.
print(classification_report(y_valid, y_pred_lr))

In [None]:
# Linear SVC baseline: TF-IDF features with class-balanced weights.
lsvc = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(class_weight='balanced')),
])

# Fit on the training split, then predict the held-out validation split.
y_pred_lsvc = lsvc.fit(X_train, y_train).predict(X_valid)

# Per-class precision / recall / F1 for the baseline Linear SVC.
print(classification_report(y_valid, y_pred_lsvc))

In [None]:
# Retrain linear SVC using the tuned hyperparameters:
# unigrams + bigrams, terms in >80% of tweets or in fewer than 2 tweets dropped.
lsvc_op = Pipeline([('tfidf', TfidfVectorizer(max_df=0.8,
                                                    min_df=2,
                                                    ngram_range=(1,2))),
                  ('clf', LinearSVC(C=0.3,
                                    class_weight='balanced',
                                    max_iter=3000))])

# Fit and predict
lsvc_op.fit(X_train, y_train)
y_pred = lsvc_op.predict(X_valid)

# The message reports an F1 improvement, so compare F1 scores (the previous
# code compared accuracy). 'weighted' averages per-class F1 by class support;
# switch to average='macro' if the unweighted mean is the target metric.
f1_baseline = metrics.f1_score(y_valid, y_pred_lsvc, average='weighted')
f1_tuned = metrics.f1_score(y_valid, y_pred, average='weighted')

print('F1 score improved by',
      round(100*((f1_tuned - f1_baseline) / f1_baseline),0),
      '%')

In [None]:
# Predict sentiment for the unlabelled test tweets with the tuned Linear SVC.
test_messages = df_test['message']
y_test = lsvc_op.predict(test_messages)

# Build the submission table (tweet id + predicted sentiment) and write it to CSV.
output = pd.DataFrame({'tweetid': df_test['tweetid'], 'sentiment': y_test})
output.to_csv('result_data.csv', index=False)
output