In [7]:
import pandas as pd
import spacy
import numpy as np
import re
pd.set_option('display.max_colwidth',200)
nlp = spacy.load("en_core_web_sm",disable=["tagger","parser","ner"])

In [8]:
df = pd.read_csv("tweet.csv")

In [9]:
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials to the experience... tacky.,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I need to take another trip!,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing about it,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [10]:
df["airline"].value_counts()

airline
United            3822
US Airways        2913
American          2759
Southwest         2420
Delta             2222
Virgin America     504
Name: count, dtype: int64

In [11]:
df.shape

(14640, 15)

In [12]:
df["text"].sample(5)

12256            @AmericanAir Is large Wichita Falls Airport not receiving any arrivals? I have called and no one answers. I been waiting for 1 day
2236     @united Nope! I'm on UA 174 that was supposed to leave at 6:47 but are still at the gate. Apparently we are about to pull back and de-ice.
10678                           @USAirways I'm dumping this dividend miles card and forgetting your airline exists #ridiculous #theworstairlineever
10350                                                               @USAirways yes - tells me the only way to mod plans is to call the 4322 number.
7481                                                                           @JetBlue Start including PTO in your getaway packages and I'm all in
Name: text, dtype: object

In [13]:
df["airline_sentiment"].value_counts()

airline_sentiment
negative    9178
neutral     3099
positive    2363
Name: count, dtype: int64

In [14]:
df[["airline_sentiment","airline"]].value_counts()

airline_sentiment  airline       
negative           United            2633
                   US Airways        2263
                   American          1960
                   Southwest         1186
                   Delta              955
neutral            Delta              723
                   United             697
                   Southwest          664
positive           Southwest          570
                   Delta              544
                   United             492
neutral            American           463
                   US Airways         381
positive           American           336
                   US Airways         269
negative           Virgin America     181
neutral            Virgin America     171
positive           Virgin America     152
Name: count, dtype: int64

In [17]:
def text_cleaner(text):
    text = re.sub(r'@[A-Za-z0-9]+','',text)
    text = re.sub(r'http\S+','',text)
    text = text.lower()
    text = re.sub("[^a-z]+"," ",text)
    text = re.sub("[\s]+"," ",text)
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if(token.is_stop == False)]
    return " ".join(tokens)

In [19]:
df["clean_text"]=df["text"].apply(text_cleaner)



In [20]:
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,clean_text
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada),said
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials to the experience... tacky.,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada),plus ve added commercials experience tacky
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I need to take another trip!,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada),didn t today mean need trip
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada),s aggressive blast obnoxious entertainment guests faces amp little recourse
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing about it,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada),s big bad thing


In [21]:
text = df["clean_text"].values
labels = df["airline_sentiment"].values

In [22]:
text[:10]

array(['  said', '  plus ve added commercials experience tacky',
       '  didn t today mean need trip',
       '  s aggressive blast obnoxious entertainment guests faces amp little recourse',
       '  s big bad thing',
       '  seriously pay flight seats didn t playing s bad thing flying va',
       '  yes nearly time fly vx ear worm won t away',
       '  missed prime opportunity men hats parody', '  didn t d',
       '  amazing arrived hour early good'], dtype=object)

In [23]:
labels[:10]

array(['neutral', 'positive', 'neutral', 'negative', 'negative',
       'negative', 'positive', 'neutral', 'positive', 'positive'],
      dtype=object)

In [25]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
labels = le.fit_transform(labels)

In [26]:
labels[:10]

array([1, 2, 1, 0, 0, 0, 2, 1, 2, 2])

In [27]:
le.inverse_transform([0,1,2])

array(['negative', 'neutral', 'positive'], dtype=object)

In [28]:
labels[:10]

array([1, 2, 1, 0, 0, 0, 2, 1, 2, 2])

In [29]:
from sklearn.model_selection import train_test_split

x_train,x_valid,y_train,y_valid = train_test_split(text,labels,stratify = labels,test_size = 0.2,random_state = 0,shuffle=True)

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
word_vectorizer = TfidfVectorizer(max_features=1000)

In [32]:
word_vectorizer.fit(x_train)

In [33]:
word_vectors = word_vectorizer.transform(x_train)
word_vectors

<11712x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 65703 stored elements in Compressed Sparse Row format>

In [34]:
val_word_vectors = word_vectorizer.transform(x_valid)
val_word_vectors

<2928x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 16550 stored elements in Compressed Sparse Row format>

## model building

In [35]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

In [36]:
nb_model = MultinomialNB().fit(word_vectors,y_train)
nb_model

In [37]:
train_pred_nb = nb_model.predict(word_vectors)

In [39]:
f1_score(y_train,train_pred_nb,average = "weighted")

0.7303326366733179

In [41]:
from sklearn.linear_model import LogisticRegression

In [42]:
lr_model = LogisticRegression().fit(word_vectors,y_train)
lr_model

In [43]:
train_pred_lr = lr_model.predict(word_vectors)

In [45]:
f1_score(y_train,train_pred_lr,average="weighted")

0.8074180766755015

In [49]:
val_pred_lr = lr_model.predict(val_word_vectors)


In [50]:
f1_score(y_valid,val_pred_lr,average="weighted")

0.7530566533332674

In [53]:
def sentiment_analyzer(tweet):
    cleaned_tweet = text_cleaner(tweet)
    tweet_vector = word_vectorizer.transform([cleaned_tweet])
    label = lr_model.predict(tweet_vector)
    return le.inverse_transform(np.array(label))

In [54]:
sentiment_analyzer(df["text"][0])

array(['negative'], dtype=object)