In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [28]:
# Load the two tweet tables from CSV files in the working directory.
# `train` carries the airline_sentiment label used for fitting below;
# `test` is only read for its 'text' column in the cells that follow.
# NOTE(review): "twite.csv" looks like it could be a typo - confirm the file name.
train = pd.read_csv("tweets.csv")
test = pd.read_csv("twite.csv")

In [29]:
# Non-null count per column, to spot sparsely populated fields.
# (A bare `train.head()` before this line would be discarded: a notebook cell
# only displays the value of its last expression.)
train.count()

tweet_id                        14640
airline_sentiment               14640
airline_sentiment_confidence    14640
negativereason                   9178
negativereason_confidence       10522
airline                         14640
airline_sentiment_gold             40
name                            14640
negativereason_gold                32
retweet_count                   14640
text                            14640
tweet_coord                      1019
tweet_created                   14640
tweet_location                   9907
user_timezone                    9820
dtype: int64

In [30]:
# Drop columns that are not used by the model: the row identifier, the two
# almost-empty "gold" annotation columns, the coordinates and the timestamp.
#
# One non-inplace drop with errors="ignore" keeps the cell idempotent: running
# it a second time is a no-op instead of raising KeyError on columns that were
# already removed.
train = train.drop(
    [
        'tweet_id',
        'airline_sentiment_gold',
        'negativereason_gold',
        'tweet_coord',
        'tweet_created',
    ],
    axis=1,
    errors="ignore",
)

In [31]:
# Summarize the remaining columns: dtypes, non-null counts and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 10 columns):
airline_sentiment               14640 non-null object
airline_sentiment_confidence    14640 non-null float64
negativereason                  9178 non-null object
negativereason_confidence       10522 non-null float64
airline                         14640 non-null object
name                            14640 non-null object
retweet_count                   14640 non-null int64
text                            14640 non-null object
tweet_location                  9907 non-null object
user_timezone                   9820 non-null object
dtypes: float64(2), int64(1), object(7)
memory usage: 1.1+ MB


In [32]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,name,retweet_count,text,tweet_location,user_timezone
0,neutral,1.0,,,Virgin America,cairdin,0,@VirginAmerica What @dhepburn said.,,Eastern Time (US & Canada)
1,positive,0.3486,,0.0,Virgin America,jnardino,0,@VirginAmerica plus you've added commercials t...,,Pacific Time (US & Canada)
2,neutral,0.6837,,,Virgin America,yvonnalynn,0,@VirginAmerica I didn't today... Must mean I n...,Lets Play,Central Time (US & Canada)
3,negative,1.0,Bad Flight,0.7033,Virgin America,jnardino,0,@VirginAmerica it's really aggressive to blast...,,Pacific Time (US & Canada)
4,negative,1.0,Can't Tell,1.0,Virgin America,jnardino,0,@VirginAmerica and it's a really big bad thing...,,Pacific Time (US & Canada)


In [33]:
# Number of tweets per sentiment label (class balance of the target).
mood_count = train['airline_sentiment'].value_counts()
mood_count

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

In [34]:
# Number of tweets per airline.
train['airline'].value_counts()

United            3822
US Airways        2913
American          2759
Southwest         2420
Delta             2222
Virgin America     504
Name: airline, dtype: int64

In [35]:
from sklearn.preprocessing import LabelEncoder

# Replace the sentiment strings with integer class ids for the classifiers.
# `le` is kept around because later cells call le.inverse_transform to turn
# predicted ids back into sentiment names.
#
# The dtype guard makes the cell safe to re-run: without it, a second run
# would refit the encoder on the already-encoded integers, and
# inverse_transform would then return 0/1/2 instead of the original strings.
if not pd.api.types.is_numeric_dtype(train['airline_sentiment']):
    le = LabelEncoder()
    train['airline_sentiment'] = le.fit_transform(train['airline_sentiment'])
train.head()

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,name,retweet_count,text,tweet_location,user_timezone
0,1,1.0,,,Virgin America,cairdin,0,@VirginAmerica What @dhepburn said.,,Eastern Time (US & Canada)
1,2,0.3486,,0.0,Virgin America,jnardino,0,@VirginAmerica plus you've added commercials t...,,Pacific Time (US & Canada)
2,1,0.6837,,,Virgin America,yvonnalynn,0,@VirginAmerica I didn't today... Must mean I n...,Lets Play,Central Time (US & Canada)
3,0,1.0,Bad Flight,0.7033,Virgin America,jnardino,0,@VirginAmerica it's really aggressive to blast...,,Pacific Time (US & Canada)
4,0,1.0,Can't Tell,1.0,Virgin America,jnardino,0,@VirginAmerica and it's a really big bad thing...,,Pacific Time (US & Canada)


In [36]:
import re
import nltk
from nltk.corpus import stopwords

In [37]:
_ENGLISH_STOPWORDS = None  # lazily-built cache, filled on first use


def _english_stopwords():
    """Return the NLTK English stop words as a set, building it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def tweet_to_words(raw_tweet):
    """Normalize one tweet into a space-separated string of content words.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, and English stop words are
    removed.

    Parameters
    ----------
    raw_tweet : str
        The raw tweet text. ``None`` or a float (how a missing text cell
        typically shows up in a pandas column) is treated as an empty tweet.

    Returns
    -------
    str
        The remaining words joined by single spaces; ``""`` when nothing is
        left or the input is missing.
    """
    if raw_tweet is None or isinstance(raw_tweet, float):
        # Missing value: there is no text to clean, and re.sub would raise.
        return ""
    letters_only = re.sub("[^a-zA-Z]", " ", raw_tweet)
    words = letters_only.lower().split()
    # The stop-word set is cached so it is not rebuilt for every tweet.
    stops = _english_stopwords()
    return " ".join(w for w in words if w not in stops)

In [38]:
# Add a cleaned-text column to both frames. The cleaner is passed directly;
# wrapping it in a lambda adds nothing.
train['clean_tweet'] = train['text'].apply(tweet_to_words)
test['clean_tweet'] = test['text'].apply(tweet_to_words)

In [39]:
# Materialize the cleaned tweets as plain Python lists of strings, in row
# order, ready to be handed to the vectorizer.
train_clean_tweet = train['clean_tweet'].tolist()
test_clean_tweet = test['clean_tweet'].tolist()

In [40]:
from sklearn.feature_extraction.text import CountVectorizer
# Word-level count features. The vectorizer is fitted on the training tweets
# only; the test tweets are then transformed with that same fitted object, so
# both matrices have the same number of columns.
v = CountVectorizer(analyzer = "word")
train_features= v.fit_transform(train_clean_tweet)
test_features= v.transform(test_clean_tweet)

In [41]:
# Show the shape and sparsity summary of the training feature matrix.
train_features

<14640x13496 sparse matrix of type '<class 'numpy.int64'>'
	with 142711 stored elements in Compressed Sparse Row format>

In [42]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

# Fit a multinomial Naive Bayes classifier on the word-count features.
nb = MultinomialNB()
nb.fit(train_features, train['airline_sentiment'])

# accuracy_score takes (y_true, y_pred), in that order.
# NOTE: this is measured on the same rows the model was fitted on, so it is an
# optimistic figure and not an estimate of performance on unseen tweets.
accuracy = accuracy_score(train['airline_sentiment'], nb.predict(train_features))
print(accuracy)

0.8387295081967213


In [43]:
# Show the shape and sparsity summary of the test feature matrix.
test_features

<3660x13496 sparse matrix of type '<class 'numpy.int64'>'
	with 35707 stored elements in Compressed Sparse Row format>

In [44]:
# Predict a class id for every test tweet and immediately map the ids back to
# the original sentiment strings with the fitted label encoder.
test_pred = le.inverse_transform(nb.predict(test_features))
print(test_pred)

# Persist the predictions, one label per line ('%s' because they are strings).
np.savetxt("test_pred.csv", test_pred, fmt='%s')

['negative' 'negative' 'negative' ... 'positive' 'positive' 'neutral']


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Fit a random forest on the same word-count features.
# random_state is fixed so that a Restart & Run All reproduces the same
# forest, and therefore the same predictions and accuracy.
clf = RandomForestClassifier(n_estimators=350, max_depth=None, random_state=42)
clf.fit(train_features, train['airline_sentiment'])

# accuracy_score takes (y_true, y_pred), in that order.
# NOTE: this is measured on the same rows the model was fitted on, so it is an
# optimistic figure and not an estimate of performance on unseen tweets.
accuracy = accuracy_score(train['airline_sentiment'], clf.predict(train_features))
print(accuracy)

In [None]:
# Predict an encoded class id for every test tweet with the random forest.
# This rebinds `test_pred`, replacing whatever an earlier cell stored in it.
test_pred = clf.predict(test_features)

In [None]:
# Map the predicted class ids back to labels with the fitted encoder.
# NOTE(review): this cell overwrites its own input, so running it twice in a
# row passes already-decoded values to inverse_transform - presumably an
# error; re-run the predict cell first.
test_pred = le.inverse_transform(test_pred)

In [None]:
# Write the predictions to disk, one value per line, formatted with '%s'.
# NOTE(review): the Naive Bayes cell writes to this same "test_pred.csv" path,
# so this call replaces that file - confirm that is intended.
np.savetxt("test_pred.csv",test_pred,fmt='%s')

In [None]:
# Print the current contents of test_pred for a quick visual check.
print(test_pred)