In [13]:
import nltk
nltk.download('punkt')
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer, word_tokenize, sent_tokenize, TweetTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer,HashingVectorizer
from nltk.stem.porter import PorterStemmer 
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
from sklearn.metrics import confusion_matrix

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#file import
#===========

df_train = pd.read_csv("../input/nlphw1/train.csv", encoding = "ISO-8859-1")
print(df_train) #.describe()

df_test = pd.read_csv("../input/nlphw1/test.csv", encoding = "ISO-8859-1")
print(df_test)

        Id                                               text  Target
0        1  @USAirways  ! THE WORST in customer service. @...      -1
1        2  @united call wait times are over 20 minutes an...      -1
2        3  @JetBlue what's up with the random delay on fl...      -1
3        4  @AmericanAir Good morning!  Wondering why my p...       0
4        5  @united UA 746. Pacific Rim and Date Night cut...      -1
...    ...                                                ...     ...
7315  7316                            @AmericanAir followback       0
7316  7317  @united thanks for the help. Wish the phone re...       1
7317  7318  @usairways the. Worst. Ever. #dca #customerser...      -1
7318  7319  @nrhodes85: look! Another apology. DO NOT FLY ...      -1
7319  7320  @united you are by far the worst airline. 4 pl...      -1

[7320 rows x 3 columns]
         id                                               text
0      7322  @AmericanAir In car gng to DFW. Pulled over 1h...
1      73

In [3]:
#pre-processing
#=================

df_train["text"]=df_train["text"].str.lower()  #convert to lower case
df_train["text"]=df_train["text"].str.replace('\d+','') # remove numbers
df_train["text"]=df_train["text"].str.replace('<.*?>','') # remove HTML tags
df_train["text"]=df_train["text"].str.replace('[^@\w\s]','') # remove punctuation  

df_test["text"]=df_test["text"].str.lower()  #convert to lower case
df_test["text"]=df_test["text"].str.replace('\d+','') # remove numbers
df_test["text"]=df_test["text"].str.replace('<.*?>','') # remove HTML tags
df_test["text"]=df_test["text"].str.replace('[^@\w\s]','') # remove punctuation  

print(df_train)
print(df_test)

        Id                                               text  Target
0        1  @usairways   the worst in customer service @us...      -1
1        2  @united call wait times are over  minutes and ...      -1
2        3  @jetblue whats up with the random delay on fli...      -1
3        4  @americanair good morning  wondering why my pr...       0
4        5  @united ua  pacific rim and date night cut out...      -1
...    ...                                                ...     ...
7315  7316                            @americanair followback       0
7316  7317  @united thanks for the help wish the phone rep...       1
7317  7318      @usairways the worst ever dca customerservice      -1
7318  7319  @nrhodes look another apology do not fly @usai...      -1
7319  7320  @united you are by far the worst airline  plan...      -1

[7320 rows x 3 columns]
         id                                               text
0      7322  @americanair in car gng to dfw pulled over hr ...
1      73

In [4]:
# stemming
#===========

st = SnowballStemmer('english')
df_train['text'] =df_train['text'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
print(df_train)


df_test['text'] =df_test['text'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
print(df_test)

#lemmatizer = WordNetLemmatizer()
#df['Question']=df_train['text'].apply(lambda x: " ".join([lemmatizer.lemmatize(word) for word in x.split()]))
#df

        Id                                               text  Target
0        1  @usairway the worst in custom servic @usairway...      -1
1        2  @unit call wait time are over minut and airpor...      -1
2        3  @jetblu what up with the random delay on fligh...      -1
3        4  @americanair good morn wonder whi my pretsa ch...       0
4        5  @unit ua pacif rim and date night cut out not ...      -1
...    ...                                                ...     ...
7315  7316                            @americanair followback       0
7316  7317  @unit thank for the help wish the phone rep co...       1
7317  7318        @usairway the worst ever dca customerservic      -1
7318  7319     @nrhode look anoth apolog do not fli @usairway      -1
7319  7320  @unit you are by far the worst airlin plane de...      -1

[7320 rows x 3 columns]
         id                                               text
0      7322  @americanair in car gng to dfw pull over hr ag...
1      73

In [5]:
# remove stopwords
#====================

stop = stopwords.words('english')
df_train['text'] = df_train['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
print(df_train)

df_test['text'] = df_test['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
print(df_test)

        Id                                               text  Target
0        1  @usairway worst custom servic @usairway call m...      -1
1        2  @unit call wait time minut airport wait time l...      -1
2        3   @jetblu random delay flight ani chanc fals alarm      -1
3        4  @americanair good morn wonder whi pretsa check...       0
4        5  @unit ua pacif rim date night cut constant ran...      -1
...    ...                                                ...     ...
7315  7316                            @americanair followback       0
7316  7317      @unit thank help wish phone rep could accomid       1
7317  7318            @usairway worst ever dca customerservic      -1
7318  7319            @nrhode look anoth apolog fli @usairway      -1
7319  7320  @unit far worst airlin plane delay round trip ...      -1

[7320 rows x 3 columns]
         id                                               text
0      7322  @americanair car gng dfw pull hr ago veri ici ...
1      73

In [6]:
#word tokenizer
#===============
# using tweetokenizer to remove @ handles. WordPunctTokenizer does not keep handles together.

tknzr = TweetTokenizer(strip_handles=True)
df_train['text']=df_train['text'].apply(lambda row: " ".join(tknzr.tokenize(row)))
print(df_train)

df_test['text']=df_test['text'].apply(lambda row: " ".join(tknzr.tokenize(row)))
print(df_test)

#sentence tokenizer
#===================

#sent_segmenter = nltk.data.load('tokenizers/punkt/english.pickle')
#df.apply(lambda row: nltk.word_tokenize(row['frases']), axis=1)
#df_train['text'].apply(lambda row: sent_segmenter.tokenize(row))

        Id                                               text  Target
0        1  worst custom servic call month book flight poo...      -1
1        2      call wait time minut airport wait time longer      -1
2        3           random delay flight ani chanc fals alarm      -1
3        4  good morn wonder whi pretsa check board pass morn       0
4        5  ua pacif rim date night cut constant random on...      -1
...    ...                                                ...     ...
7315  7316                                         followback       0
7316  7317            thank help wish phone rep could accomid       1
7317  7318                      worst ever dca customerservic      -1
7318  7319                              look anoth apolog fli      -1
7319  7320  far worst airlin plane delay round trip flight...      -1

[7320 rows x 3 columns]
         id                                               text
0      7322  car gng dfw pull hr ago veri ici road onhold a...
1      73

In [9]:
# Logistic Regression
#========================
pipe_LR_TF = Pipeline([('tfidfvectorizer', TfidfVectorizer()), ('logisticregression',LogisticRegression(solver='lbfgs'))]) 
param_grid_LR_TF = {'logisticregression__C': [0.001, 0.01, 0.1], "tfidfvectorizer__ngram_range": [(1, 1), (1, 2)]}
grid_LR_TF = GridSearchCV(pipe_LR_TF, param_grid_LR_TF, cv=5)
grid_LR_TF.fit(df_train['text'], df_train['Target'])
print("LR_TF - Best cross-validation score: {:.2f}".format(grid_LR_TF.best_score_))
print("LR_TF - Best cross-validation score: {0}\n".format(grid_LR_TF.best_params_))

pipe_LR_HV =Pipeline([('HashingVectorizer', HashingVectorizer()), ('logisticregression',LogisticRegression(solver='lbfgs'))])  
#make_pipeline(HashingVectorizer(stop_words='english'), LogisticRegression(solver='lbfgs'))
param_grid_LR_HV = {'logisticregression__C': [0.001, 0.01, 0.1], "HashingVectorizer__ngram_range": [(1, 1), (1, 2)]}
grid_LR_HV = GridSearchCV(pipe_LR_HV, param_grid_LR_HV, cv=5)
grid_LR_HV.fit(df_train['text'], df_train['Target'])
print("LR_HV - Best cross-validation score: {:.2f}".format(grid_LR_HV.best_score_))
print("LR_HV - Best cross-validation score: {0}\n".format(grid_LR_HV.best_params_))

LR_TF - Best cross-validation score: 0.68
LR_TF - Best cross-validation score: {'logisticregression__C': 0.1, 'tfidfvectorizer__ngram_range': (1, 1)}

LR_HV - Best cross-validation score: 0.70
LR_HV - Best cross-validation score: {'HashingVectorizer__ngram_range': (1, 1), 'logisticregression__C': 0.1}



In [None]:
# Naive Bayes requires todense() option
#===========================================
#pipe_GS_TF = Pipeline([('tfidfvectorizer', TfidfVectorizer()), ('GaussianNB', GaussianNB())]) 
#param_grid_GS_TF = {"tfidfvectorizer__ngram_range": [(1, 1), (1, 2)]}
#grid_GS_TF = GridSearchCV(pipe_GS_TF, param_grid_GS_TF, cv=5)
#grid_GS_TF.fit(df_train['text'], df_train['Target'])
#print("GS_TF - Best cross-validation score: {:.2f}".format(grid_GS_TF.best_score_))
#print("GS_TF - Best cross-validation score: {0}\n".format(grid_GS_TF.best_params_))

#pipe_GS_HV =Pipeline([('HashingVectorizer', HashingVectorizer()), ('GaussianNB', GaussianNB())])  
#make_pipeline(HashingVectorizer(stop_words='english'), LogisticRegression(solver='lbfgs'))
#param_grid_GS_HV = {"HashingVectorizer__ngram_range": [(1, 1), (1, 2)]}
#grid_GS_HV = GridSearchCV(pipe_GS_HV, param_grid_GS_HV, cv=5)
#grid_GS_HV.fit(df_train['text'], df_train['Target'])
#print("GS_HV - Best cross-validation score: {:.2f}".format(grid_GS_HV.best_score_))
#print("GS_HV - Best cross-validation score: {0}\n".format(grid_GS_HV.best_params_))


In [11]:
# XGBoost
#==========
pipe_XG_TF = Pipeline([('tfidfvectorizer', TfidfVectorizer()), ('XGBClassifier',XGBClassifier())]) 
param_grid_XG_TF = {"tfidfvectorizer__ngram_range": [(1, 1), (1, 2)], 'XGBClassifier__n_estimators': [25,50],'XGBClassifier__max_depth': [7,10], 'XGBClassifier__learning_rate': [0.01,0.001]}
grid_XG_TF = GridSearchCV(pipe_XG_TF, param_grid_XG_TF, cv=5)
grid_XG_TF.fit(df_train['text'], df_train['Target'])
print("XG_TF - Best cross-validation score: {:.2f}".format(grid_XG_TF.best_score_))
print("XG_TF - Best cross-validation score: {0}\n".format(grid_XG_TF.best_params_))

pipe_XG_HV =Pipeline([('HashingVectorizer', HashingVectorizer(n_features=20)), ('XGBClassifier',XGBClassifier())])  
#make_pipeline(HashingVectorizer(stop_words='english'), LogisticRegression(solver='lbfgs'))
param_grid_XG_HV = {"HashingVectorizer__ngram_range": [(1, 1), (1, 2)],'XGBClassifier__n_estimators': [25,50],'XGBClassifier__max_depth': [7,10], 'XGBClassifier__learning_rate': [0.01,0.001]}
grid_XG_HV = GridSearchCV(pipe_XG_HV, param_grid_XG_HV, cv=5)
grid_XG_HV.fit(df_train['text'], df_train['Target'])
print("XG_HV - Best cross-validation score: {:.2f}".format(grid_XG_HV.best_score_))
print("XG_HV - Best cross-validation score: {0}\n".format(grid_XG_HV.best_params_))

XG_TF - Best cross-validation score: 0.70
XG_TF - Best cross-validation score: {'XGBClassifier__learning_rate': 0.01, 'XGBClassifier__max_depth': 10, 'XGBClassifier__n_estimators': 25, 'tfidfvectorizer__ngram_range': (1, 1)}

XG_HV - Best cross-validation score: 0.67
XG_HV - Best cross-validation score: {'HashingVectorizer__ngram_range': (1, 1), 'XGBClassifier__learning_rate': 0.01, 'XGBClassifier__max_depth': 7, 'XGBClassifier__n_estimators': 50}



In [27]:
# Logistic regression with Hash vectorization, and XG Boost with Tfidf give cross validation accuracy of 70%. 
# Logistic regression with TF-IDF has 68% accuracy. 
#Evaluating predicted test results


#Logistic regression with Hash vectorization
#===========================================
hashvector = HashingVectorizer(ngram_range=(1,1))
train_vectors_LR_HV = hashvector.fit_transform(df_train['text']) 

test_vectors = hashvector.transform(df_test['text'])

model_LR_HV=LogisticRegression(solver='lbfgs', C=0.1)

model_LR_HV.fit(train_vectors_LR_HV,df_train['Target'])
y_pred_LR_HV = model_LR_HV.predict(test_vectors)

df_test['Target_LR_HV']=y_pred_LR_HV

#print(confusion_matrix(df_train['Target'], y_pred_LR_HV))


#XG Boost with Tfidf vectorization
#===================================
tfidf = TfidfVectorizer(ngram_range=(1,1))
train_vectors_XG_TF = tfidf.fit_transform(df_train['text'])  

test_vectors_XG_TF = tfidf.transform(df_test['text'])

model_XG_TF=XGBClassifier(learning_rate=0.01,max_depth=10,n_estimators=25,ngram_range=(1, 1))

model_XG_TF.fit(train_vectors_XG_TF,df_train['Target'])

y_pred_XG_TF = model_XG_TF.predict(test_vectors_XG_TF)

df_test['Target']=y_pred_XG_TF



Parameters: { ngram_range } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [None]:
df_test

In [None]:
# from the comparison, we see that the sentence (after stemming) with id = 7326 is classified as negative sentiment in 1st model 
# and classified as positive sentiment in 2nd model. So 1st model does a better job.

header = ["id", "Target"]
df_test.to_csv('submission.csv', columns = header, index=False, encoding='utf-8')
