# Import libraries

In [1]:
import numpy as np
import pandas as pd

#Visualization
import seaborn as sns

#For preprocessing
from sklearn.base import TransformerMixin
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
import contractions
import re, string, unicodedata
from bs4 import BeautifulSoup


# For building pipeline
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC, SVC

from IPython.display import HTML, display

# Load and describe data

In [2]:
tweets_df = pd.read_csv("Tweets.csv")
tweets_df.shape

(14640, 15)

In [3]:
tweets_df = tweets_df[['text', 'airline_sentiment']]
tweets_df.shape

(14640, 2)

In [4]:
#Check for null data
tweets_df.isnull().sum()

text                 0
airline_sentiment    0
dtype: int64

In [5]:
pd.set_option('display.max_colwidth', None)
tweets_df.head()

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials to the experience... tacky.,positive
2,@VirginAmerica I didn't today... Must mean I need to take another trip!,neutral
3,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",negative
4,@VirginAmerica and it's a really big bad thing about it,negative


In [6]:
blanks = []
for index, txt, sentiment in tweets_df.itertuples():
    if (type(txt) == 'str' and txt.isspace()) or (type(sentiment) == 'str' and sentiment.isspace()):
        blanks.append(index)
        
print(f"The dataset contains {len(blanks)} only space data")

The dataset contains 0 only space data


In [7]:
tweets_df.airline_sentiment.value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

### Insight

- The provided data does not contain any empty feature or label values
- The data is biased towards negative tweets
- All data in column text and airline_sentiment is just String data

# Data preprocessing

In [8]:
stopwords = stopwords.words('english')

# Following words need to be removed from stopwords because they convey negative sentiments
customlist = ['not', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
        "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
        "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn',
        "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

stopwords = list(set(stopwords) - set(customlist))

In [9]:
class TextPreprocessor(TransformerMixin):
    '''
    This class is used to preprocess the text.
    It extends TranformerMixin class so that it can be part of the pipeline.
    '''
    def __init__(self):
        pass
    
    def fit(self, df, y=None, **fit_params):
        return self
    
    def transform(self, df, **transform_params):
        def helper(txt):
            txt = self.remove_html(txt)
            contractions.fix(txt)
            txt = self.remove_numbers(txt)
            tokens = self.tokenize(txt)
            tokens = self.to_lower(tokens)
            tokens = self.remove_stopwords(tokens)
            tokens = self.remove_punct(tokens)
            tokens = self.lemmetize(tokens)
            return self.join(tokens)

        df_copy = df.copy(deep=True)
        df_copy = df_copy.apply(helper)
        return df_copy
    
    def remove_html(self, txt):
        return BeautifulSoup(txt, "html.parser").get_text()

    def remove_numbers(self, txt):
        return re.sub(r'\d+', '', txt)
    
    def tokenize(self, txt):
        return word_tokenize(txt)
    
    def remove_stopwords(self, lst_tokens):
        return [word for word in lst_tokens if word not in stopwords]
    
    
    def remove_punct(self, lst_token):
        new_lst_tokens = []
        for word in lst_token:
            new_word = re.sub(r'[^\w\s]', '', word)
            if new_word != '':
                new_lst_tokens.append(new_word)
        return new_lst_tokens

    
    def to_lower(self, lst_token):
        return [token.lower() for token in lst_token]
    
    def lemmetize(self, lst_token):
        lemmetizer = WordNetLemmatizer()
        return [lemmetizer.lemmatize(token, pos='v') for token in lst_token]
    
    def join(self, lst_token):
        return ' '.join(lst_token)

In [10]:
preprocessor = TextPreprocessor()

tweets_df_transformed = tweets_df.copy()
tweets_df_transformed.drop('text', axis=1, inplace=True)

tweets_df_transformed['text'] = preprocessor.fit_transform(tweets_df.text)

In [11]:
tweets_df_transformed.head()

Unnamed: 0,airline_sentiment,text
0,neutral,virginamerica dhepburn say
1,positive,virginamerica plus ve add commercials experience tacky
2,neutral,virginamerica nt today must mean need take another trip
3,negative,virginamerica s really aggressive blast obnoxious entertainment guests face little recourse
4,negative,virginamerica s really big bad thing


### Insights

- Based on above, the transformer is working fine and is ready to be used as part of the pipeline.

# Build pipelines

In [12]:
X_train, X_test, y_train, y_test = train_test_split(tweets_df['text'], tweets_df['airline_sentiment'], test_size=0.3, random_state=42)

In [13]:
def build_score(vectorizer, models):
    scores = {}
    max_score = 0.0
    max_y_pred = None

    for model in models:
        pipeline = make_pipeline(TextPreprocessor(), vectorizer, model)
        pipeline.fit(X_train, y_train)
        accuracy_score = np.mean(cross_val_score(pipeline, X_test, y_test, cv=10))
        max_score = max(accuracy_score, max_score)
        scores[model.__class__.__name__] = accuracy_score
        if max_score == accuracy_score:
            max_y_pred = pipeline.predict(X_test)
            
    print(scores)
    print(max_score)
    print(metrics.classification_report(y_test, max_y_pred))
    print(metrics.confusion_matrix(y_test, max_y_pred))

In [14]:
build_score(CountVectorizer(), [
    LogisticRegression(solver='liblinear'), 
    MultinomialNB(), 
    RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=5), 
    GradientBoostingClassifier(random_state=42), 
    LinearSVC(), 
    SVC()
])

{'LogisticRegression': 0.7768632221992131, 'MultinomialNB': 0.7481735348933527, 'RandomForestClassifier': 0.6407113273969767, 'GradientBoostingClassifier': 0.7500031062331746, 'LinearSVC': 0.7443057568854836, 'SVC': 0.7734536135845931}
0.7768632221992131
              precision    recall  f1-score   support

    negative       0.85      0.91      0.88      2814
     neutral       0.64      0.56      0.59       884
    positive       0.77      0.70      0.73       694

    accuracy                           0.80      4392
   macro avg       0.75      0.72      0.73      4392
weighted avg       0.80      0.80      0.80      4392

[[2551  194   69]
 [ 315  491   78]
 [ 120   88  486]]


In [15]:
build_score(TfidfVectorizer(), [
    LogisticRegression(solver='liblinear'), 
    MultinomialNB(), 
    RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=5), 
    GradientBoostingClassifier(random_state=42), 
    LinearSVC(), 
    SVC()
])

{'LogisticRegression': 0.7668492441499275, 'MultinomialNB': 0.671220231932077, 'RandomForestClassifier': 0.6407113273969767, 'GradientBoostingClassifier': 0.7561518948022364, 'LinearSVC': 0.7675264029819838, 'SVC': 0.7695853178711949}
0.7695853178711949
              precision    recall  f1-score   support

    negative       0.81      0.95      0.87      2814
     neutral       0.69      0.46      0.55       884
    positive       0.83      0.62      0.71       694

    accuracy                           0.80      4392
   macro avg       0.78      0.68      0.71      4392
weighted avg       0.79      0.80      0.78      4392

[[2664  110   40]
 [ 428  407   49]
 [ 188   76  430]]


### Insights

- Based on the above analysis, count vectorization and logistic regression works best according to accuracy score
- But if we consider that this study is to correctly identify negative reviews (for improvement), Tfidf vectorizer with SVC works best.
- Count vectorizer and Tfidf vectorizer are computationally very slow. So, we need to use some hyperparameters for better performance

In [16]:
build_score(TfidfVectorizer(max_features=2000, max_df=0.95), [SVC()])

{'SVC': 0.7741354317664112}
0.7741354317664112
              precision    recall  f1-score   support

    negative       0.82      0.94      0.88      2814
     neutral       0.69      0.48      0.57       884
    positive       0.82      0.63      0.71       694

    accuracy                           0.80      4392
   macro avg       0.78      0.69      0.72      4392
weighted avg       0.79      0.80      0.79      4392

[[2655  115   44]
 [ 404  427   53]
 [ 183   73  438]]


# Vader Sentiment analysis

In [17]:
from vader_sentiment.vader_sentiment import SentimentIntensityAnalyzer

In [18]:
analyzer = SentimentIntensityAnalyzer()
tweets_df_transformed['vader_score'] = tweets_df_transformed['text'].apply(lambda x: analyzer.polarity_scores(x)['compound'])
tweets_df_transformed['vader_sentiment'] = tweets_df_transformed['vader_score'].apply(
    lambda c: 'positive' if c > 0 else ('neutral' if c == 0 else 'negative'))

In [19]:
metrics.accuracy_score(tweets_df_transformed['airline_sentiment'], tweets_df_transformed['vader_sentiment'])

0.5183060109289618

In [20]:
tweets_df['vader_score'] = tweets_df['text'].apply(lambda x: analyzer.polarity_scores(x)['compound'])
tweets_df['vader_sentiment'] = tweets_df['vader_score'].apply(lambda c: 'positive' if c > 0 else ('neutral' if c == 0 else 'negative'))

In [21]:
metrics.accuracy_score(tweets_df['airline_sentiment'], tweets_df['vader_sentiment'])

0.546448087431694

### Insight

While vader segmentation does not provide good results on both transformed and unprocessed data, it still offers better better prediction that random guessing. <br />
We can conclude that Vader analysis is not suitable for the provided data