In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

In [None]:
# Load the dataset from a CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="dataset.csv")

In [None]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

In [None]:
# Encode each distinct category as an integer code and store it in a new
# 'category_id' column; element [0] of factorize's result holds the codes.
df['category_id'] = pd.factorize(df['category'])[0]

In [None]:
# Build a lookup table with one row per unique (category, category_id) pair,
# ordered by the integer id.
category_id_df = (
    df[['category', 'category_id']]
    .drop_duplicates()
    .sort_values(by='category_id')
)
category_id_df

In [None]:
# Two-way lookup between category names and their integer ids.
# Each row of category_id_df.values is a (category, category_id) pair.
category_to_id = {name: idx for name, idx in category_id_df.values}
id_to_category = {idx: name for name, idx in category_id_df.values}
id_to_category

In [None]:
# TF-IDF vectorizer configuration -- adjust the hyperparameters to suit your data.
# Parameter reference:
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
vectorizer = TfidfVectorizer(
    stop_words='english',   # drop the built-in English stop words
    ngram_range=(1, 2),     # extract unigrams and bigrams
    sublinear_tf=True,      # sublinear term-frequency scaling
    norm='l2',              # L2-normalise each document vector
    encoding='latin-1',
)

In [None]:
# Fit the TF-IDF vocabulary on every question and keep the document-term
# matrix as a dense array; the integer category ids are the targets.
# NOTE(review): .toarray() may be memory-hungry on a large corpus -- the sparse
# matrix is probably sufficient for the steps below; confirm before changing.
features = vectorizer.fit_transform(df['question']).toarray()
labels = df['category_id']
features.shape

In [None]:
# Exploratory analysis only: for each category, show the unigrams and bigrams
# whose TF-IDF values are most strongly associated with that category
# according to a chi-squared test.
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html
print("-----Printing Top 5 Correlated Words-----")
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back on older releases. The vocabulary does not change
# between categories, so it is looked up once instead of on every iteration.
if hasattr(vectorizer, "get_feature_names_out"):
    all_feature_names = np.asarray(vectorizer.get_feature_names_out())
else:
    all_feature_names = np.asarray(vectorizer.get_feature_names())
for category, category_id in sorted(category_to_id.items()):
    # One-vs-rest target: True for rows that belong to this category.
    features_chi2 = chi2(features, labels == category_id)
    # Element [0] holds the chi-squared statistics; argsort is ascending, so
    # the strongest terms end up at the tail of each list.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print(category)
    print("\nUnigrams:",unigrams[-5:])
    print("\nBigrams:",bigrams[-5:])

In [None]:
# Split the raw question text and category names into train and test parts
# (seeded so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    df['question'],
    df['category'],
    random_state=0,
)

In [None]:
# Bag-of-words counts followed by TF-IDF re-weighting, both fitted on X_train.
count_vect = CountVectorizer()
Vector_transformer = TfidfTransformer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_trans = Vector_transformer.fit_transform(X_train_counts)

In [None]:
# Classifiers
# Several candidate models are compared below; choose the one with the best cross-validated accuracy.

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

In [None]:
# Candidate models to compare; random_state=0 is set where given so results
# are repeatable.
classifiers = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

In [None]:
# Number of cross-validation folds used to score each classifier.
CV = 5
# Accumulates (model_name, fold_index, accuracy) records. The results frame
# cv_df is built from these records once scoring is done, so no placeholder
# DataFrame is allocated here.
entries = []

In [None]:
# Score every candidate classifier with CV-fold cross-validation and collect
# one (model_name, fold_index, accuracy) record per fold.
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html
# Start from an empty list so that re-running this cell never appends
# duplicate rows to the results.
entries = []
for model in classifiers:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    entries.extend((model_name, i, accuracy) for i, accuracy in enumerate(accuracies))
cv_df = pd.DataFrame(entries, columns=['model_name', 'i', 'accuracy'])

In [None]:
# Mean cross-validated accuracy for each classifier.
cv_df.groupby('model_name')['accuracy'].mean()

In [None]:
# Select the model with the highest mean cross-validation accuracy.

In [None]:
# Pick the classifier with the best mean cross-validated accuracy. The original
# placeholder here was not valid Python, so the cell could not run.
# To force a specific estimator instead, assign it directly,
# e.g. `model = LinearSVC()`.
mean_accuracy = cv_df.groupby('model_name')['accuracy'].mean()
best_model_name = mean_accuracy.idxmax()
model = {clf.__class__.__name__: clf for clf in classifiers}[best_model_name]
# Hold out 30% of the TF-IDF feature rows for evaluation. This rebinds
# X_train/X_test/y_train/y_test, which earlier held the raw-text split.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Confusion matrix of the held-out labels against the model's predictions.
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)