In [1]:
# Natural Language Processing

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings

warnings.filterwarnings("ignore")
# Importing the dataset
dataset = pd.read_csv('ISEAR.csv', header = None).iloc[:, :2]

In [2]:
# Cleaning the texts
import re
import nltk
#nltk.download('stopwords')
#from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 7517):
    review = re.sub('[^a-zA-Z]', ' ', dataset.iloc[i,1])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    #review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = [ps.stem(word) for word in review]
    review = ' '.join(review)
    corpus.append(review)

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 15000)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 0].values

In [5]:
print(X.shape)
print(y.shape)

(7517, 6143)
(7517,)


In [6]:
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
y

array([0, 5, 3, ..., 3, 4, 3])

In [7]:
# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [8]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [9]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 1500,n_jobs=-1,random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [10]:
inp = "I want to die".lower()
inp = ps.stem(inp)
inp = cv.transform([inp]).toarray()
res = classifier.predict(inp)
labelencoder_y.inverse_transform(res)

array(['guilt'], dtype=object)

In [11]:
inp = "I feel awesome".lower()
inp = ps.stem(inp)
inp = cv.transform([inp]).toarray()
res = classifier.predict(inp)
labelencoder_y.inverse_transform(res)

array(['joy'], dtype=object)

In [12]:
inp = "let me go".lower()
inp = ps.stem(inp)
inp = cv.transform([inp]).toarray()
res = classifier.predict(inp)
labelencoder_y.inverse_transform(res)

array(['anger'], dtype=object)

In [13]:
# Dump model
import pickle
filename = 'model.sav'
pickle.dump(classifier, open(filename, 'wb'))