In [1]:
import nltk
import pandas as pd
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
tweets_csv_path = r"C:\Users\SURABHI\PycharmProjects\Tweets.csv"
tweets = pd.read_csv(tweets_csv_path)
# Column names are computed here but not displayed (not the last expression of the cell).
list(tweets.columns.values)
# Number of rows loaded; this is the value the cell displays.
len(tweets)

14640

In [2]:
# value_counts() returns the number of rows per distinct label, most frequent first.
sentiment_counts = tweets["airline_sentiment"].value_counts()
# Count of non-null tweet ids.
number_of_tweets = tweets["tweet_id"].count()
print(sentiment_counts)
# Peek at the first five raw tweet texts.
tweets["text"].head(5)

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64


0                  @VirginAmerica What @dhepburn said.
1    @VirginAmerica plus you've added commercials t...
2    @VirginAmerica I didn't today... Must mean I n...
3    @VirginAmerica it's really aggressive to blast...
4    @VirginAmerica and it's a really big bad thing...
Name: text, dtype: object

In [3]:

import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Shared NLP resources used by normalizer(): a lemmatizer instance and the
# English stop-word list held as a set for fast membership tests.
wordnet_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def normalizer(tweet, skip_tokens=2):
    """Turn a raw tweet into a list of lower-cased, lemmatized content words.

    Steps: replace every non-letter character with a space, tokenize with
    NLTK, drop the first ``skip_tokens`` tokens, lower-case, remove the words
    found in the module-level ``stop_words`` set and lemmatize what is left
    with the module-level ``wordnet_lemmatizer``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    skip_tokens : int, optional
        Number of leading tokens to discard (must be non-negative). The
        default of 2 reproduces the original hard-coded behaviour. On the
        airline tweets shown in this notebook that removes the leading
        airline handle *and* the word that follows it (e.g. "plus" in
        "@VirginAmerica plus you've added ..."). Pass 0 to keep every token.

    Returns
    -------
    list of str
        Lemmas of the remaining tokens, in their original order.
    """
    only_letters = re.sub("[^a-zA-Z]", " ", tweet)
    tokens = nltk.word_tokenize(only_letters)[skip_tokens:]
    # Lower-case before the stop-word test so capitalised stop words are dropped too.
    lowered = (token.lower() for token in tokens)
    content_words = (word for word in lowered if word not in stop_words)
    return [wordnet_lemmatizer.lemmatize(word) for word in content_words]
# Quick sanity check of normalizer() on a hand-written sentence.
normalizer("Here is text about an airline I like.")

['text', 'airline', 'like']

In [4]:
# Show the full content of each cell. ``None`` means "no limit"; the former
# ``-1`` sentinel was deprecated in pandas 1.0 in favour of ``None``.
pd.set_option('display.max_colwidth', None)
# Normalize every tweet's text into a list of lemmas.
tweets['normalized_tweet'] = tweets.text.apply(normalizer)
tweets[['text','normalized_tweet']].head()

Unnamed: 0,text,normalized_tweet
0,@VirginAmerica What @dhepburn said.,"[dhepburn, said]"
1,@VirginAmerica plus you've added commercials to the experience... tacky.,"[added, commercial, experience, tacky]"
2,@VirginAmerica I didn't today... Must mean I need to take another trip!,"[today, must, mean, need, take, another, trip]"
3,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse","[really, aggressive, blast, obnoxious, entertainment, guest, face, amp, little, recourse]"
4,@VirginAmerica and it's a really big bad thing about it,"[really, big, bad, thing]"


In [5]:
from nltk import ngrams
def ngrams(input_list, sizes=(2, 3)):
    """Build space-joined n-grams from a list of tokens.

    NOTE: this definition shadows the ``ngrams`` name imported from ``nltk``
    just above it; after this cell runs, ``ngrams`` refers to this function.

    Parameters
    ----------
    input_list : list of str
        Tokens in their original order.
    sizes : iterable of int, optional
        Window lengths to generate, in output order. The default ``(2, 3)``
        yields all bigrams followed by all trigrams. Pass e.g. ``(1, 2, 3)``
        to include the individual tokens as well.

    Returns
    -------
    list of str
        For each size in ``sizes``, every run of that many consecutive tokens
        joined by a single space. Sizes longer than the input contribute
        nothing.
    """
    grams = []
    for size in sizes:
        # Zipping the list against itself shifted by 0..size-1 yields each
        # window of `size` consecutive tokens; zip stops at the shortest slice.
        windows = zip(*(input_list[offset:] for offset in range(size)))
        grams.extend(' '.join(window) for window in windows)
    return grams
# Derive the n-gram list of every normalized tweet and preview the new column.
tweets['grams'] = tweets['normalized_tweet'].apply(ngrams)
tweets[['grams']].head()

Unnamed: 0,grams
0,[dhepburn said]
1,"[added commercial, commercial experience, experience tacky, added commercial experience, commercial experience tacky]"
2,"[today must, must mean, mean need, need take, take another, another trip, today must mean, must mean need, mean need take, need take another, take another trip]"
3,"[really aggressive, aggressive blast, blast obnoxious, obnoxious entertainment, entertainment guest, guest face, face amp, amp little, little recourse, really aggressive blast, aggressive blast obnoxious, blast obnoxious entertainment, obnoxious entertainment guest, entertainment guest face, guest face amp, face amp little, amp little recourse]"
4,"[really big, big bad, bad thing, really big bad, big bad thing]"


In [6]:
import collections
def count_words(input):
    """Tally how often each item occurs across all rows of ``input``.

    ``input`` is an iterable of rows, each row itself an iterable of items
    (here: lists of n-gram strings). Returns a ``collections.Counter``
    mapping every item to its total number of occurrences.
    """
    return collections.Counter(word for row in input for word in row)

# Twenty most frequent n-grams among the tweets labelled negative.
negative_grams = tweets.loc[tweets.airline_sentiment == 'negative', 'grams']
count_words(negative_grams).most_common(20)

[('http co', 449),
 ('customer service', 438),
 ('cancelled flightled', 425),
 ('late flight', 215),
 ('cancelled flighted', 196),
 ('flight cancelled', 185),
 ('late flightr', 144),
 ('cancelled flight', 131),
 ('hold hour', 128),
 ('flightled flight', 123),
 ('flight cancelled flightled', 117),
 ('flight delayed', 115),
 ('cancelled flightled flight', 107),
 ('call back', 106),
 ('booking problem', 98),
 ('gate agent', 83),
 ('flight flight', 74),
 ('hour late', 69),
 ('delayed flight', 69),
 ('flight attendant', 60)]

In [7]:
# Twenty most frequent n-grams among the tweets labelled positive.
positive_grams = tweets.loc[tweets.airline_sentiment == 'positive', 'grams']
count_words(positive_grams).most_common(20)

[('http co', 233),
 ('customer service', 91),
 ('flight attendant', 25),
 ('quick response', 19),
 ('great flight', 17),
 ('best airline', 16),
 ('great job', 16),
 ('great service', 16),
 ('gate agent', 16),
 ('booking problem', 15),
 ('thanks help', 15),
 ('thank much', 15),
 ('good work', 14),
 ('fleet fleek', 14),
 ('fleek http', 14),
 ('fleet fleek http', 14),
 ('fleek http co', 14),
 ('guy rock', 13),
 ('looking forward', 13),
 ('great customer', 12)]

In [8]:
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features over unigrams and bigrams, fitted on the raw tweet text.
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
vectorized_data = count_vectorizer.fit_transform(tweets.text)

# Prepend a column holding each row's position so that rows can still be
# identified after the data has been shuffled and split.
row_count = vectorized_data.shape[0]
row_positions = np.arange(row_count).reshape(-1, 1)
indexed_data = hstack((row_positions, vectorized_data))

def sentiment2target(sentiment):
    """Map a sentiment label to its integer class id.

    'negative' -> 0, 'neutral' -> 1, 'positive' -> 2.
    Any other label raises ``KeyError``.
    """
    ordered_labels = ('negative', 'neutral', 'positive')
    label_to_id = {label: position for position, label in enumerate(ordered_labels)}
    return label_to_id[sentiment]
# Integer class id for every tweet (0 = negative, 1 = neutral, 2 = positive).
targets = tweets.airline_sentiment.apply(sentiment2target)

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
data_train, data_test, targets_train, targets_test = train_test_split(
    indexed_data,
    targets,
    test_size=0.3,
    random_state=0,
)
# Peel the leading row-position column off each split, keeping it separately,
# so that only the n-gram counts remain as features.
data_train_index, data_train = data_train[:, 0], data_train[:, 1:]
data_test_index, data_test = data_test[:, 0], data_test[:, 1:]

In [10]:
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
# One-vs-rest wrapper around a linear SVC with probability estimates enabled.
# NOTE(review): per the scikit-learn docs gamma is a coefficient for the
# rbf/poly/sigmoid kernels, so it likely has no effect with kernel='linear' — confirm.
base_svc = svm.SVC(
    kernel='linear',
    C=100.,
    gamma=0.01,
    class_weight='balanced',
    probability=True,
)
clf = OneVsRestClassifier(base_svc)
clf_output = clf.fit(data_train, targets_train)

# Score of the fitted classifier on the held-out split.
clf.score(data_test, targets_test)


0.7802823315118397

In [11]:
# Predicted class ids for the held-out rows.
predict=clf.predict(data_test)
# NOTE(review): accuracy_score is a function imported from sklearn.metrics, not
# an attribute of clf; the intended call is presumably
# accuracy_score(targets_test, predict).

In [12]:
# Hand-written probes, including an empty string, to eyeball the classifier.
sample_sentences = [
    "What a great airline, the trip was a pleasure!",
    "My issue was quickly resolved after calling customer support. Thanks!",
    "What the hell! My flight was cancelled again. This sucks!",
    "Service was awful. I'll never fly with you again.",
    "You losers lost my luggage. Never again!",
    "I have mixed feelings about airlines. I don't know what I think.",
    "",
]
# Encode with the already-fitted vocabulary, then show per-class probabilities.
sentences = count_vectorizer.transform(sample_sentences)
clf.predict_proba(sentences)

array([[0.12338998, 0.06470563, 0.81190439],
       [0.12305412, 0.06915121, 0.80779467],
       [0.96374547, 0.02521347, 0.01104107],
       [0.90228529, 0.07836866, 0.01934605],
       [0.97783972, 0.01091269, 0.01124759],
       [0.50459283, 0.4534472 , 0.04195997],
       [0.2606491 , 0.5043371 , 0.23501381]])

In [13]:
import pandas as pd

In [14]:
# Load the tweets to be scored; the path is relative to the working directory.
df=pd.read_csv('testing_data.csv')

In [15]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,0
0,0,"RT @OGR_EN: Critics of #humanrights legalism are right to call for more “pragmatism,” but this must be contextual, looking for advocacy hoo…"
1,1,"Critics of #humanrights legalism are right to call for more “pragmatism,” but this must be contextual, looking for… https://t.co/1x5kxwHsT7"
2,2,"In Kenya, Guatemala and Brazil, courts have defied presidents and shaken up politics—is court-centric advocacy one… https://t.co/AimbDuG3bw"
3,3,"RT @OGR_EN: Critics of human rights legalism are right to call for more “pragmatism,” but this must be contextual, looking for advocacy hoo…"
4,4,"RT @OGR_EN: Critics of human rights legalism are right to call for more “pragmatism,” but this must be contextual, looking for advocacy hoo…"


In [16]:
# Select the second column by position; in the displayed output it holds the tweet text.
df.iloc[:,1]

0     RT @OGR_EN: Critics of #humanrights legalism are right to call for more “pragmatism,” but this must be contextual, looking for advocacy hoo…      
1     Critics of #humanrights legalism are right to call for more “pragmatism,” but this must be contextual, looking for… https://t.co/1x5kxwHsT7       
2     In Kenya, Guatemala and Brazil, courts have defied presidents and shaken up politics—is court-centric advocacy one… https://t.co/AimbDuG3bw       
3     RT @OGR_EN: Critics of human rights legalism are right to call for more “pragmatism,” but this must be contextual, looking for advocacy hoo…      
4     RT @OGR_EN: Critics of human rights legalism are right to call for more “pragmatism,” but this must be contextual, looking for advocacy hoo…      
5     Critics of human rights legalism are right to call for more “pragmatism,” but this must be contextual, looking for… https://t.co/IitQ3m9fkF       
6     Dustin Sharp makes a valuable @OGR_EN contribution to discussion of human ri

In [17]:
# Encode the second column of df with the already-fitted vectorizer and
# compute per-class probabilities for every row.
tweet_texts = df.iloc[:, 1]
sen = count_vectorizer.transform(tweet_texts)
out = clf.predict_proba(sen)