In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv("train_twitter.csv",delimiter=',')

In [3]:
train.isna().sum()

tweet_id                      0
airline_sentiment             0
airline                       0
airline_sentiment_gold    10949
name                          0
negativereason_gold       10956
retweet_count                 0
text                          0
tweet_coord               10204
tweet_created                 0
tweet_location             3550
user_timezone              3577
dtype: int64

In [4]:
test = pd.read_csv("test_twitter.csv",delimiter=',')
test.head()

Unnamed: 0,tweet_id,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,569682010270101504,American,,zsalim03,,0,@AmericanAir In car gng to DFW. Pulled over 1h...,,2015-02-22 18:15:50 -0800,Texas,Central Time (US & Canada)
1,569608307184242688,American,,sa_craig,,0,"@AmericanAir after all, the plane didn’t land ...",,2015-02-22 13:22:57 -0800,"College Station, TX",Central Time (US & Canada)
2,567879304593408001,Southwest,,DanaChristos,,1,@SouthwestAir can't believe how many paying cu...,,2015-02-17 18:52:31 -0800,CT,Eastern Time (US & Canada)
3,569757651539660801,US Airways,,rossj987,,0,@USAirways I can legitimately say that I would...,,2015-02-22 23:16:24 -0800,"Washington, D.C.",Eastern Time (US & Canada)
4,569900705852608513,American,,tranpham18,,0,@AmericanAir still no response from AA. great ...,,2015-02-23 08:44:51 -0800,New York City,Eastern Time (US & Canada)


# Cleaning the data

In [5]:
#removing all the columns with many NAs
del train["airline_sentiment_gold"]
del train["negativereason_gold"]
del train["tweet_coord"]
del train["tweet_location"]
del train["user_timezone"]

In [6]:
train.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,name,retweet_count,text,tweet_created
0,567900433542488064,negative,Southwest,ColeyGirouard,0,"@SouthwestAir I am scheduled for the morning, ...",2015-02-17 20:16:29 -0800
1,569989168903819264,positive,Southwest,WalterFaddoul,0,@SouthwestAir seeing your workers time in and ...,2015-02-23 14:36:22 -0800
2,568089179520954368,positive,United,LocalKyle,0,@united Flew ORD to Miami and back and had gr...,2015-02-18 08:46:29 -0800
3,568928195581513728,negative,Southwest,amccarthy19,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,2015-02-20 16:20:26 -0800
4,568594180014014464,negative,United,J_Okayy,0,@united so our flight into ORD was delayed bec...,2015-02-19 18:13:11 -0800


In [7]:
#removing the redundant data
#like tweet_id, airline, tweet created, retweet count, name.
#they are not important to determine the sentiments
del train["tweet_id"]
del train["airline"]
del train["tweet_created"]
del train["name"]
del train["retweet_count"]

In [8]:
train_doc = train["text"].values
test_doc = test["text"].values
train_result = train["airline_sentiment"].values

# Cleaning the text

In [9]:
from nltk.corpus import stopwords
stops = set(stopwords.words('english'))
import string
punc = list(string.punctuation)
stops.update(punc)
len(stops)

211

In [10]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [11]:
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemmatizer = WordNetLemmatizer()    

In [12]:
def clean_text(text):
    words = word_tokenize(text)
    output = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag(w)
            clean_word = lemmatizer.lemmatize(w.lower(),get_simple_pos(pos[0][1]))
            output.append(clean_word)
    cleaned_review = " ".join(output)
    return cleaned_review

In [13]:
xtrain = [clean_text(train_doc[i]) for i in range(len(train_doc))]
xtrain[0]

'southwestair scheduled morning 2 days fact yes..not sure evening flight one cancel flightled'

In [14]:
xtrain[0:5]

['southwestair scheduled morning 2 days fact yes..not sure evening flight one cancel flightled',
 'southwestair seeing worker time time going beyond love flying guy thank',
 'united flew ord miami back great crew service leg thanks',
 "southwestair dultch97 's horse radish 😤🐴",
 'united flight ord delayed air force one last flight sbn 8:20 5 min landed']

In [15]:
#removing emojis from the text
import emoji
def give_emoji_free_text(text):
    allchars = [str for str in text]
    emoji_list = [c for c in allchars if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)])

    return clean_text


In [16]:
xtrain = [give_emoji_free_text(xtrain[i]) for i in range(len(xtrain))]
xtrain[0:5]

['southwestair scheduled morning 2 days fact yes..not sure evening flight one cancel flightled',
 'southwestair seeing worker time time going beyond love flying guy thank',
 'united flew ord miami back great crew service leg thanks',
 "southwestair dultch97 's horse radish",
 'united flight ord delayed air force one last flight sbn 8:20 5 min landed']

# Working on the text

In [58]:
from sklearn.feature_extraction.text import CountVectorizer
#count_vec = CountVectorizer(max_features = 1000,max_df=0.95,min_df=0.05)
#count_vec = CountVectorizer(max_features = 1000)
count_vec = CountVectorizer(max_features = 1000,max_df=0.98,min_df=0.03,ngram_range=(1,3))
#count_vec = CountVectorizer(max_features = 2000,ngram_range=(1,3))

In [59]:
x_train_features = count_vec.fit_transform(xtrain)
x_train_features

<10980x44 sparse matrix of type '<class 'numpy.int64'>'
	with 33365 stored elements in Compressed Sparse Row format>

In [34]:
from sklearn.model_selection import train_test_split
xtrain_sample,xtest_sample,ytrain_sample,ytest_sample = train_test_split(xtrain,train_result,random_state = 0)

In [34]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
svc = SVC()
grid = {'C' : [1e2, 1e3, 5e3, 1e4, 5e4, 1e5],
       'gamma' : [1e-3, 5e-4, 1e-4, 5e-3]}
abc = GridSearchCV(svc, grid)
abc.fit(x_train_features, train_result)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [100.0, 1000.0, 5000.0, 10000.0, 50000.0, 100000.0], 'gamma': [0.001, 0.0005, 0.0001, 0.005]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [35]:
abc.best_estimator_

SVC(C=50000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [35]:
xtest = [clean_text(test_doc[i]) for i in range(len(test_doc))]
xtest[0:5]

["americanair car gng dfw pulled 1hr ago icy road on-hold aa since 1hr ca n't reach arpt aa2450 wat 2",
 'americanair plane ’ land identical worse condition grk according metars',
 "southwestair ca n't believe many pay customer left high dry reason flight cancel flightlations monday bdl wow",
 'usairways legitimately say would rather driven cross country flown u airway',
 'americanair still response aa great job guy']

In [36]:
xtest = [give_emoji_free_text(xtest[i]) for i in range(len(xtest))]
xtest[0:5]

["americanair car gng dfw pulled 1hr ago icy road on-hold aa since 1hr ca n't reach arpt aa2450 wat 2",
 'americanair plane ’ land identical worse condition grk according metars',
 "southwestair ca n't believe many pay customer left high dry reason flight cancel flightlations monday bdl wow",
 'usairways legitimately say would rather driven cross country flown u airway',
 'americanair still response aa great job guy']

In [60]:
x_test_features = count_vec.transform(xtest)
x_test_features

<3660x44 sparse matrix of type '<class 'numpy.int64'>'
	with 11044 stored elements in Compressed Sparse Row format>

In [61]:
from sklearn.svm import SVC
svc = SVC(C=100,gamma=0.008)
svc.fit(x_train_features,train_result)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.008, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [62]:
y_pred = svc.predict(x_test_features)

In [63]:
np.savetxt("pred.csv",y_pred,delimiter=',',fmt='%s')