In [51]:
import pandas as pd
import nltk
import re
from collections import Counter

In [52]:
df_train = pd.read_csv('../datasets/Tweets-train.csv')
df = df_train[['text','airline_sentiment']]
df.head()

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica plus you've added commercials t...,positive
1,@VirginAmerica I didn't today... Must mean I n...,neutral
2,@VirginAmerica it's really aggressive to blast...,negative
3,@VirginAmerica and it's a really big bad thing...,negative
4,"@VirginAmerica yes, nearly every time I fly VX...",positive


In [53]:
tags = df['text'].str.extractall(r'(@\w+)')[0]

In [54]:
df_neutral = df[df['airline_sentiment'] == 'neutral']
df_positive = df[df['airline_sentiment'] == 'positive']
df_negative = df[df['airline_sentiment'] == 'negative']

In [55]:
def preprocess_tweet(tweet_text):
    # Remove URLs (http/https)
    tweet_text = re.sub(r'http\S+|www\S+', '', tweet_text)
    
    # Remove punctuation marks
    tweet_text = re.sub(r'[^\w\s]', '', tweet_text)
    
    # Remove emoticons
    emoticons = [':)', ':(', ':D', ':P', ':/', ':|']
    for emoticon in emoticons:
        tweet_text = tweet_text.replace(emoticon, '')
    
    # Remove '@' mentions
    tweet_text = re.sub(r'@\w+', '', tweet_text)
    
    return tweet_text.strip()

In [56]:
df_neutral['cleaned_tweet'] = df_neutral['text'].apply(preprocess_tweet)
df_negative['cleaned_tweet'] = df_negative['text'].apply(preprocess_tweet)
df_positive['cleaned_tweet'] = df_positive['text'].apply(preprocess_tweet)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_neutral['cleaned_tweet'] = df_neutral['text'].apply(preprocess_tweet)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_negative['cleaned_tweet'] = df_negative['text'].apply(preprocess_tweet)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_positive['cleaned_tweet'] = df_positive['text'].apply(

In [57]:
df_positive.head()

Unnamed: 0,text,airline_sentiment,cleaned_tweet
0,@VirginAmerica plus you've added commercials t...,positive,VirginAmerica plus youve added commercials to ...
4,"@VirginAmerica yes, nearly every time I fly VX...",positive,VirginAmerica yes nearly every time I fly VX t...
5,"@virginamerica Well, I didn't…but NOW I DO! :-D",positive,virginamerica Well I didntbut NOW I DO D
6,"@VirginAmerica it was amazing, and arrived an ...",positive,VirginAmerica it was amazing and arrived an ho...
8,@VirginAmerica I &lt;3 pretty graphics. so muc...,positive,VirginAmerica I lt3 pretty graphics so much be...


In [58]:
def get_most_common_words(tweet_text, num_words=15):
    # Tokenize words in each tweet
    tokens = nltk.word_tokenize(tweet_text.lower())
    
    # Count word frequencies
    word_freq = Counter(tokens)
    
    # Get the most common words
    most_common_words = word_freq.most_common(num_words)
    
    return most_common_words

In [59]:
neutral_txt = ' '.join(df_neutral['cleaned_tweet'])
positive_txt = ' '.join(df_positive['cleaned_tweet'])
negative_txt = ' '.join(df_negative['cleaned_tweet'])

In [60]:
print("15 Most Common words in Neutral Sentiment: ", get_most_common_words(neutral_txt))
print("15 Most Common words in Positive Sentiment: ", get_most_common_words(positive_txt))
print("15 Most Common words in Negative Sentiment: ", get_most_common_words(negative_txt))

15 Most Common words in Neutral Sentiment:  [('to', 1184), ('i', 843), ('the', 729), ('a', 588), ('you', 530), ('united', 527), ('jetblue', 522), ('southwestair', 487), ('on', 472), ('for', 442), ('flight', 422), ('my', 387), ('is', 371), ('americanair', 363), ('in', 344)]
15 Most Common words in Positive Sentiment:  [('the', 689), ('to', 675), ('you', 632), ('for', 493), ('i', 448), ('thanks', 446), ('jetblue', 438), ('southwestair', 421), ('united', 376), ('a', 373), ('thank', 335), ('and', 304), ('my', 262), ('flight', 261), ('americanair', 254)]
15 Most Common words in Negative Sentiment:  [('to', 4318), ('the', 3013), ('i', 2605), ('a', 2300), ('united', 2085), ('flight', 2070), ('and', 2039), ('on', 2005), ('for', 1996), ('you', 1806), ('usairways', 1722), ('my', 1718), ('americanair', 1544), ('is', 1525), ('in', 1254)]


In [61]:
def remove_stop_words_from_tweet(tweet_text):
    words = nltk.word_tokenize(tweet_text)
                          
    filtered_words = [word.lower() for word in words if word.lower() not in nltk.corpus.stopwords.words('english')]
    
    return ' '.join(filtered_words)

In [62]:
df_neutral['cleaned_tweet_without_stopwords'] = df_neutral['cleaned_tweet'].apply(remove_stop_words_from_tweet)
df_negative['cleaned_tweet_without_stopwords'] = df_negative['cleaned_tweet'].apply(remove_stop_words_from_tweet)
df_positive['cleaned_tweet_without_stopwords'] = df_positive['cleaned_tweet'].apply(remove_stop_words_from_tweet)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_neutral['cleaned_tweet_without_stopwords'] = df_neutral['cleaned_tweet'].apply(remove_stop_words_from_tweet)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_negative['cleaned_tweet_without_stopwords'] = df_negative['cleaned_tweet'].apply(remove_stop_words_from_tweet)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-

In [63]:
neutral_txt = ' '.join(df_neutral['cleaned_tweet_without_stopwords'])
positive_txt = ' '.join(df_positive['cleaned_tweet_without_stopwords'])
negative_txt = ' '.join(df_negative['cleaned_tweet_without_stopwords'])

In [64]:
print("15 Most Common words in Neutral Sentiment: ", get_most_common_words(neutral_txt))
print("15 Most Common words in Positive Sentiment: ", get_most_common_words(positive_txt))
print("15 Most Common words in Negative Sentiment: ", get_most_common_words(negative_txt))

15 Most Common words in Neutral Sentiment:  [('united', 527), ('jetblue', 522), ('southwestair', 487), ('flight', 422), ('americanair', 363), ('usairways', 302), ('get', 173), ('please', 130), ('virginamerica', 129), ('flights', 128), ('help', 118), ('need', 113), ('thanks', 113), ('im', 106), ('would', 92)]
15 Most Common words in Positive Sentiment:  [('thanks', 446), ('jetblue', 438), ('southwestair', 421), ('united', 376), ('thank', 335), ('flight', 261), ('americanair', 254), ('usairways', 199), ('great', 162), ('service', 119), ('virginamerica', 114), ('love', 98), ('best', 85), ('customer', 85), ('good', 81)]
15 Most Common words in Negative Sentiment:  [('united', 2085), ('flight', 2070), ('usairways', 1722), ('americanair', 1544), ('southwestair', 882), ('jetblue', 753), ('get', 721), ('cancelled', 644), ('service', 523), ('hours', 492), ('help', 437), ('customer', 420), ('hold', 417), ('time', 415), ('2', 415)]


In [65]:
words_to_remove = ['americanair', 'united', 'delta', 'southwestair', 'jetblue', 'virginamerica', 'usairways', 'flight', 'plane']

def remove_given_words_from_tweet(tweet_text):
    words = nltk.word_tokenize(tweet_text)
                          
    filtered_words = [word.lower() for word in words if word.lower() not in words_to_remove]
    
    return ' '.join(filtered_words)

In [66]:
df_neutral['cleaned_tweet_without_given_words'] = df_neutral['cleaned_tweet_without_stopwords'].apply(remove_given_words_from_tweet)
df_negative['cleaned_tweet_without_given_words'] = df_negative['cleaned_tweet_without_stopwords'].apply(remove_given_words_from_tweet)
df_positive['cleaned_tweet_without_given_words'] = df_positive['cleaned_tweet_without_stopwords'].apply(remove_given_words_from_tweet)

neutral_txt = ' '.join(df_neutral['cleaned_tweet_without_given_words'])
positive_txt = ' '.join(df_positive['cleaned_tweet_without_given_words'])
negative_txt = ' '.join(df_negative['cleaned_tweet_without_given_words'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_neutral['cleaned_tweet_without_given_words'] = df_neutral['cleaned_tweet_without_stopwords'].apply(remove_given_words_from_tweet)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_negative['cleaned_tweet_without_given_words'] = df_negative['cleaned_tweet_without_stopwords'].apply(remove_given_words_from_tweet)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/st

In [67]:
print("15 Most Common words in Neutral Sentiment: ", get_most_common_words(neutral_txt))
print("15 Most Common words in Positive Sentiment: ", get_most_common_words(positive_txt))
print("15 Most Common words in Negative Sentiment: ", get_most_common_words(negative_txt))

15 Most Common words in Neutral Sentiment:  [('get', 173), ('please', 130), ('flights', 128), ('help', 118), ('need', 113), ('thanks', 113), ('im', 106), ('would', 92), ('dm', 82), ('time', 76), ('cancelled', 74), ('know', 72), ('tomorrow', 72), ('us', 70), ('fleek', 70)]
15 Most Common words in Positive Sentiment:  [('thanks', 446), ('thank', 335), ('great', 162), ('service', 119), ('love', 98), ('best', 85), ('customer', 85), ('good', 81), ('guys', 81), ('much', 77), ('get', 76), ('got', 71), ('awesome', 70), ('amazing', 63), ('help', 63)]
15 Most Common words in Negative Sentiment:  [('get', 721), ('cancelled', 644), ('service', 523), ('hours', 492), ('help', 437), ('customer', 420), ('hold', 417), ('time', 415), ('2', 415), ('im', 398), ('delayed', 361), ('still', 352), ('us', 352), ('amp', 350), ('cant', 348)]


In [74]:
df_processed = pd.concat([df_negative, df_neutral, df_positive], ignore_index=True)
df_processed = df_processed[['airline_sentiment', 'cleaned_tweet_without_given_words']]
df_processed.head()

Unnamed: 0,airline_sentiment,cleaned_tweet_without_given_words
0,negative,really aggressive blast obnoxious entertainmen...
1,negative,really big bad thing
2,negative,sfopdx schedule still mia
3,negative,first fares may three times carriers seats ava...
4,negative,guys messed seating reserved seating friends g...


In [75]:
from sklearn.preprocessing import LabelEncoder
le= LabelEncoder()
df_processed['airline_sentiment'] = le.fit_transform(df_processed['airline_sentiment'])
df_processed.head()

Unnamed: 0,airline_sentiment,cleaned_tweet_without_given_words
0,0,really aggressive blast obnoxious entertainmen...
1,0,really big bad thing
2,0,sfopdx schedule still mia
3,0,first fares may three times carriers seats ava...
4,0,guys messed seating reserved seating friends g...


In [76]:
df_processed['airline_sentiment'].unique()

array([0, 1, 2])

In [80]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()

x_train = cv.fit_transform(df_processed['cleaned_tweet_without_given_words']).toarray()
y_train = df_processed['airline_sentiment']

In [81]:
cv.get_feature_names_out()

array(['0016', '006', '0162389030167', ..., 'zukes', 'zurich',
       'zurichnew'], dtype=object)

In [82]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()

nb.fit(x_train, y_train) # training

In [86]:
df_test = pd.read_csv('../datasets/Tweets-test.csv')
df_test = df_test[['text','airline_sentiment']]
df_test['text'] = df_test['text'].apply(preprocess_tweet)
df_test['text'] = df_test['text'].apply(remove_stop_words_from_tweet)
df_test['text'] = df_test['text'].apply(remove_given_words_from_tweet)
df_test.head()

Unnamed: 0,text,airline_sentiment
0,drop call dont people answering phones always ...,negative
1,thanks seat doesnt recline im shocked im asked...,negative
2,wasnt delay counter wouldnt take valid cac car...,negative
3,social media team point oscarnight,positive
4,birds could fly south america example argentina,neutral


In [87]:
df_test['airline_sentiment'] = le.transform(df_test['airline_sentiment'])
df_test.head()

Unnamed: 0,text,airline_sentiment
0,drop call dont people answering phones always ...,0
1,thanks seat doesnt recline im shocked im asked...,0
2,wasnt delay counter wouldnt take valid cac car...,0
3,social media team point oscarnight,2
4,birds could fly south america example argentina,1


In [88]:
x_test = cv.transform(df_test['text']).toarray()
y_test = df_test['airline_sentiment']

In [89]:
y_pred = nb.predict(x_test)

df_test['airline_sentiment_predicted'] = y_pred
df_test.head()

Unnamed: 0,text,airline_sentiment,airline_sentiment_predicted
0,drop call dont people answering phones always ...,0,0
1,thanks seat doesnt recline im shocked im asked...,0,0
2,wasnt delay counter wouldnt take valid cac car...,0,0
3,social media team point oscarnight,2,0
4,birds could fly south america example argentina,1,1


In [92]:
from sklearn import metrics

print("Accuracy Score: ", metrics.accuracy_score(df_test['airline_sentiment'], df_test['airline_sentiment_predicted']))

Accuracy Score:  0.77125


In [93]:
print("Confusion Matrix: ")
metrics.confusion_matrix(df_test['airline_sentiment'], df_test['airline_sentiment_predicted'])

Confusion Matrix: 


array([[2425,   60,   23],
       [ 487,  310,   54],
       [ 249,   42,  350]], dtype=int64)