In [1]:
import pandas as pd

## Loading Data

In [2]:
train_data = pd.read_csv("x_y_train.csv")
test_data = pd.read_csv("x_test.csv")

In [3]:
## Training data
x_train_data = train_data["text"]
y_train_data = train_data["airline_sentiment"]

## Testing data
x_test_data = test_data["text"]

## Importing Stopwords

In [4]:
## Importing stopwords
from nltk.corpus import stopwords
stop = stopwords.words("english")

## Importing punctuations
import string
punctuations = string.punctuation

## Adding Punctuations to our stop words
stop += punctuations

## Train Test Split

In [5]:
## Splitting in the given training data for our training and testing
from sklearn.model_selection import train_test_split
x_train_train, x_train_test, y_train_train, y_train_test = train_test_split(x_train_data, y_train_data, 
                                                                            random_state = 0, test_size = 0.25)

## Count Vectoriser

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vec = TfidfVectorizer(max_features=3000, ngram_range=(1,2), stop_words=stop, 
                             analyzer='word', max_df = 0.8, lowercase = True, use_idf = True, smooth_idf = True)

In [7]:
## Fit transform the training data
train_features = tf_idf_vec.fit_transform(x_train_train)

## Only transform the testing data according to the features which was fit using x_train
test_features = tf_idf_vec.transform(x_train_test)

## Applying various classifiers

In [8]:
## Applying SVC
from sklearn.svm import SVC
svc = SVC()
svc.fit(train_features, y_train_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [9]:
svc.score(test_features, y_train_test)

0.63315118397085612

In [10]:
## Applying Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(train_features, y_train_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [11]:
rf.score(test_features, y_train_test)

0.72386156648451727

In [12]:
## Applying Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
bayes = MultinomialNB(alpha=0.4)
bayes.fit(train_features, y_train_train)

MultinomialNB(alpha=0.4, class_prior=None, fit_prior=True)

In [13]:
bayes.score(test_features, y_train_test)

0.75555555555555554

## Applying Grid Search CV

In [14]:
from sklearn.model_selection import GridSearchCV
clf = MultinomialNB()
grid = {"alpha" :[0.1,0.2,0.3,0.4,0.5,0.6,0.7]}
abc = GridSearchCV(clf, grid) 
abc.fit(test_features, y_train_test)

GridSearchCV(cv=None, error_score='raise',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [15]:
abc.best_estimator_

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

## Prediction

In [16]:
pred_features = tf_idf_vec.transform(x_test_data)
y_pred = bayes.predict(pred_features)

In [17]:
import numpy as np
np.savetxt("output.csv", y_pred, fmt='%s')