# Linguistic Feature Detection Model

## 1. Importing Dependencies

In [2]:
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
import json
import features
import nltk

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

## 2. Input data

In [3]:
# Read the JSON training set into a DataFrame and give its five columns
# readable names.
column_names = ["review_id", "hotel_name", "review", "polarity", "spam"]
with open("../Data/TrainingSet") as fh:
    data = json.load(fh)
df = pd.DataFrame(data)
df.columns = column_names

## 3. Extracting linguistic features

In [4]:
# Names of the linguistic feature columns, listed in the same order as the
# twelve values returned by features.majorfunc for a single review.
feature_columns = [
    "num_of_words",
    "avg_words_per_sent",
    "unique_words",
    "self_words",
    "brand",
    "avg_word_length",
    "connectors",
    "digits",
    "verbs_per_noun",
    "adj",
    "prep",
    "adverb",
]

# Compute the features for every review in one pass instead of writing the
# frame cell-by-cell with iloc/get_loc and a hand-maintained row counter.
feature_rows = [features.majorfunc(review) for review in df["review"]]

# Reuse df's index so the values line up row-for-row, and cast to float so the
# columns keep the float dtype they had when they were pre-filled with NaN.
feature_frame = pd.DataFrame(
    feature_rows, columns=feature_columns, index=df.index
).astype(float)

# Plain per-column assignment keeps the cell safe to re-run: existing feature
# columns are overwritten rather than duplicated.
for column_name in feature_columns:
    df[column_name] = feature_frame[column_name]

## 4. Encoding categorical data

In [5]:
# Label encoding: replace each hotel name with an integer code.
hotel_codes = le.fit_transform(df["hotel_name"].astype('str'))
df["hotel_name"] = hotel_codes
# One-hot encoding: one indicator column per hotel code.
just_dummies = pd.get_dummies(df['hotel_name'])
# Append the indicator columns, then discard the encoded source column.
df = pd.concat([df, just_dummies], axis=1).drop(['hotel_name'], axis=1)

## 5. Removing useless columns

In [6]:
# The review identifier is not used as a model input, so remove the column.
df = df.drop(['review_id'], axis=1)

## 6. Resulting data-set

In [7]:
# Show the first rows of the assembled data set as plain text.
preview = df.head()
print(preview)

                                              review  polarity  spam  \
0  Fairmont Chicago was a great choice for my wif...         1     1   
1  Conrad Chicago it was 5:00 AM my plan just fle...         1     1   
2  My husband and I snagged a great deal on a wee...         1     1   
3  The Hilton in Chicago was awesome. The room wa...         1     1   
4  My husband and I stayed at the Hyatt Regency w...         1     1   

   num_of_words  avg_words_per_sent  unique_words  self_words  brand  \
0          73.0           10.571429          53.0    0.073171    0.0   
1         351.0           23.466667         181.0    0.074380    0.0   
2         135.0           17.000000          94.0    0.046667    0.0   
3         104.0            8.750000          61.0    0.058824    0.0   
4         114.0           14.375000          80.0    0.072581    0.0   

   avg_word_length  connectors ...  10  11  12  13  14  15  16  17  18  19  
0         4.451220         3.0 ...   0   0   0   0   0   

## 6. Function to word tokenize and remove stop-words

In [8]:
def preprocess(mess):
    """Remove punctuation from ``mess`` and return its non-stop-word tokens.

    Parameters
    ----------
    mess : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The whitespace-separated words of ``mess`` after every character in
        ``string.punctuation`` has been removed, in their original order and
        original casing, excluding words whose lower-cased form is an English
        stop word.
    """
    # Fetch the stop-word list once and keep it in a set, instead of asking
    # for it again (and scanning it linearly) for every single token.
    stop_words = set(stopwords.words('english'))
    nopunc = ''.join(c for c in mess if c not in string.punctuation)
    return [word for word in nopunc.split() if word.lower() not in stop_words]

## 7. Dividing the data set into a training set and a test set

In [10]:
# Every column except 'spam' is a predictor; 'spam' is the label to predict.
predictors = df.drop(['spam'], axis=1)
labels = df['spam']
# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    predictors, labels, test_size=0.2, random_state=1
)
print(X_train.shape, X_test.shape,y_train.shape)
# Turn the label Series into single-column DataFrames.
y_train, y_test = y_train.to_frame(), y_test.to_frame()
print(type(y_test))

(1584, 34) (396, 34) (1584,)
<class 'pandas.core.frame.DataFrame'>


## 8. Transforming review text data to numerical form using TF-IDF vectorizer

In [11]:
# Fit the TF-IDF vocabulary on the training reviews only, then apply that
# same fitted vocabulary to the test reviews.
vectorizer = TfidfVectorizer(min_df=4, max_df=0.7, sublinear_tf=True, use_idf=True,stop_words='english')
# toarray() gives a regular ndarray; todense() would return numpy's
# discouraged np.matrix type.
train_tfidf = vectorizer.fit_transform(X_train['review']).toarray()
test_tfidf = vectorizer.transform(X_test['review']).toarray()
print(train_tfidf.shape)
print(X_train.shape)
# The raw text is no longer needed among the numeric features. Rebind rather
# than drop in place, so frames returned by train_test_split are not mutated.
X_train = X_train.drop(['review'], axis=1)
X_test = X_test.drop(['review'], axis=1)

(1584, 2859)
(1584, 34)


In [12]:
# Report the shape of the TF-IDF matrix and of the remaining feature frame.
for frame in (train_tfidf, X_train):
    print(frame.shape)

(1584, 2859)
(1584, 33)


In [13]:
# Wrap the TF-IDF arrays in DataFrames that carry the row index of the
# matching feature frame. X_train/X_test keep the row labels they had in df,
# so a default 0..n-1 index here would not match them, and a column-wise
# pd.concat (which aligns on index labels) would produce extra, NaN-filled rows.
train_tfidf = pd.DataFrame(train_tfidf, index=X_train.index)
print(type(X_train))
print(type(train_tfidf))
test_tfidf = pd.DataFrame(test_tfidf, index=X_test.index)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [13]:
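# NOTE(review): this concat is disabled. Its recorded output, (1893, 2892), has
# more rows than X_train (1584), which is what pd.concat(axis=1) yields when the
# two frames' row indexes do not match. Make sure the indexes of X_train/X_test
# and the TF-IDF frames agree before re-enabling it.
#X_train = pd.concat([X_train,train_tfidf],axis=1)
#X_test  = pd.concat([X_test ,test_tfidf ],axis=1)
#print(X_train.shape)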

(1893, 2892)


In [14]:
# Confirm that the training features and labels have the same number of rows.
for frame in (X_train, y_train):
    print(frame.shape)

(1584, 33)
(1584, 1)


In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm

# Candidate settings: two kernels crossed with two values of C.
parameters = {'kernel':('linear','rbf'), 'C':[1,10]}
svc = svm.SVC()
# Evaluate every parameter combination with 10-fold cross-validation.
clf = GridSearchCV(estimator=svc, param_grid=parameters, cv=10)

# Flatten the single-column label frames into 1-D arrays.
train_labels = y_train.values.ravel()
test_labels = y_test.values.ravel()

model = clf.fit(X_train, train_labels)
test_score = model.score(X_test, test_labels)
print("Score:", test_score)

Score: 0.7297979797979798


In [16]:
%matplotlib inline
corr = rets.corr()
plt.figure(figsize=(10, 10))
plt.imshow(corr, cmap='RdYlGn', interpolation='none', aspect='auto')
plt.colorbar()
plt.xticks(range(len(corr)), corr.columns, rotation='vertical')
plt.yticks(range(len(corr)), corr.columns);
plt.suptitle('Stock Correlations Heat Map', fontsize=15, fontweight='bold')
plt.show()

NameError: name 'rets' is not defined