In [None]:
import pandas as pd
import nltk
# Fetch the NLTK resources this notebook relies on: the stop-word corpus
# (read through stopwords.words in clean_data) and the 'punkt' tokenizer data
# (presumably what sent_tokenize/word_tokenize load -- confirm for your NLTK version).
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
from matplotlib.colors import ListedColormap
from sklearn.feature_extraction.text import CountVectorizer
# Display option: lift the row limit so printed DataFrames/Series are not truncated.
pd.set_option('display.max_rows', None)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Location of the tweets CSV; it is downloaded again every time this cell runs.
TWEETS_CSV_URL = "https://raw.githubusercontent.com/lkyin/ECS189L/main/Tweets.csv"
df = pd.read_csv(TWEETS_CSV_URL)

In [None]:
def _english_stopwords():
  """Return NLTK's English stop words as a frozenset, loading them only once."""
  cached = getattr(_english_stopwords, "_cache", None)
  if cached is None:
    # stopwords.words() re-reads the corpus on every call, so keep the result.
    cached = frozenset(stopwords.words('english'))
    _english_stopwords._cache = cached
  return cached


def clean_data(text, stop_words=None):
  """Normalise one tweet into a space-separated string of content words.

  Each whitespace-separated token is lower-cased, then:
    * dropped if it starts with "@" (a mention) or is a stop word;
    * stripped of every character that is neither a word character nor
      whitespace, and of all digit runs;
    * dropped if nothing is left, or if what is left is a stop word
      (so "you!" is removed just like "you").

  Args:
    text: The raw tweet as a ``str``.
    stop_words: Optional container of lower-case words to discard. Defaults
      to NLTK's English stop-word list.

  Returns:
    The surviving tokens joined by single spaces ("" if none survive).
  """
  if stop_words is None:
    stop_words = _english_stopwords()

  kept = []
  for raw_token in text.split():
    token = raw_token.lower()
    if token.startswith("@") or token in stop_words:
      continue
    token = re.sub(r'[^\w\s]', '', token)
    token = re.sub(r'\d+', '', token)
    # Re-check after stripping: trailing punctuation hides stop words from
    # the first test, and a token may have been reduced to nothing.
    if token and token not in stop_words:
      kept.append(token)
  return " ".join(kept)

# Clean every tweet; str() coerces any non-string cell before clean_data splits it.
df['text'] = df['text'].map(lambda tweet: clean_data(str(tweet)))

In [None]:
# One single-column ('text') frame per sentiment label.
pos_df = df.loc[df['airline_sentiment'] == "positive", ['text']]
neg_df = df.loc[df['airline_sentiment'] == "negative", ['text']]
neut_df = df.loc[df['airline_sentiment'] == "neutral", ['text']]

# (heading, frame) pairs, reported in the order positive, negative, neutral.
class_reports = [
  ("top 10 most frequent words in positive sentiment class: ", pos_df),
  ("top 10 most frequent words in negative sentiment class: ", neg_df),
  ("top 10 most frequent words in neutral sentiment class: ", neut_df),
]
for heading, class_frame in class_reports:
  print(heading)
  # Split on whitespace into one column per word, flatten with stack()
  # (which drops the padding NaNs), then count occurrences of each word.
  word_counts = class_frame['text'].str.split(expand=True).stack().value_counts()
  print(word_counts[:10])

top 10 most frequent words in positive sentiment class: 
thanks      609
thank       453
flight      375
you         261
great       233
service     160
love        132
get         114
customer    113
guys        111
dtype: int64
top 10 most frequent words in negative sentiment class: 
flight       2918
get           984
cancelled     920
service       742
hours         653
help          610
hold          608
customer      604
time          584
im            547
dtype: int64
top 10 most frequent words in neutral sentiment class: 
flight     602
get        238
please     179
flights    167
help       163
need       163
thanks     154
im         136
would      122
dm         121
dtype: int64


In [None]:
ps = PorterStemmer()


def stem_unique(tweet):
  """Tokenise ``tweet`` and return its distinct Porter stems as one string.

  Stems keep the order in which they first appear and each stem is emitted
  only once, so a repeated word contributes a single token to the result.

  Args:
    tweet: A cleaned tweet as a ``str``.

  Returns:
    The de-duplicated stems joined by single spaces.
  """
  stems = (ps.stem(token) for token in word_tokenize(tweet))
  # dict.fromkeys de-duplicates while preserving first-seen order.
  return " ".join(dict.fromkeys(stems))


# Assign the whole column in one step. Writing row by row through
# df['text'].iloc[ind] = ... is chained assignment: it raises
# SettingWithCopyWarning and, with pandas Copy-on-Write enabled, never
# reaches df at all.
df['text'] = df['text'].apply(stem_unique)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [None]:
# Turn the text column into a sparse document-term count matrix.
vectorizer = CountVectorizer()
tweet_list = df['text'].tolist()
X = vectorizer.fit_transform(tweet_list)

# One seed for both the split and the tree, so re-running the cell
# reproduces the printed score (an unseeded DecisionTreeClassifier permutes
# features randomly at each split).
# NOTE(review): the earlier remark "seed=8, 17 best" suggests this seed was
# picked by looking at test accuracy; if so the reported score is optimistic.
SEED = 17
train_X, test_X, train_Y, test_Y = train_test_split(
    X, df['airline_sentiment'].tolist(), test_size=0.2, random_state=SEED)

# 4-fold cross-validated search over tree depth: 1, 11, 21, ..., 191.
params = {"max_depth": range(1, 200, 10)}
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=SEED), params,
    n_jobs=-1, scoring="accuracy", cv=4)
grid_search.fit(train_X, train_Y)

# Keep the search object and the winning estimator under separate names.
clf = grid_search.best_estimator_
print("Accuracy of the best model on test data: ",clf.score(test_X, test_Y))

Accuracy of the best model on test data:  0.7223360655737705


In [None]:
# Full-row subsets of df, one per sentiment label.
pos_df = df[df['airline_sentiment'] == "positive"]
neg_df = df[df['airline_sentiment'] == "negative"]
neut_df = df[df['airline_sentiment'] == "neutral"]

# Tweet counts per airline, indexed by airline name.
tot_counts = df['airline'].value_counts()
pos_counts = pos_df['airline'].value_counts()
neg_counts = neg_df['airline'].value_counts()

# Fraction of each airline's tweets that are positive / negative.
# Counts are looked up by airline name, so positive and negative figures can
# never be paired with the wrong airline (indexing the negative counts with a
# position taken from the positive ordering did exactly that). .get(..., 0)
# also covers an airline that has no tweets of a given sentiment.
pos_dict = {}
neg_dict = {}
for airline, total in tot_counts.items():
  pos_dict[airline] = int(pos_counts.get(airline, 0)) / int(total)
  neg_dict[airline] = int(neg_counts.get(airline, 0)) / int(total)

# Report the three airlines with the largest fraction in each category.
print("Ranking of airline based on fraction of positive tweets: ",sorted(pos_dict, key=lambda x: pos_dict[x], reverse=True)[:3])
print("Ranking of airline based on fraction of negative tweets: ",sorted(neg_dict, key=lambda x: neg_dict[x], reverse=True)[:3])

Ranking of airline based on fraction of positive tweets:  ['Virgin America', 'Delta', 'Southwest']
Ranking of airline based on fraction of negative tweets:  ['Southwest', 'Delta', 'United']
