In [None]:
import pandas as pd
import re
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

# Algorithms
from sklearn.svm import LinearSVC, SVC

import seaborn as sns

In [None]:
def clean_str(string):
    """Normalise a raw text string before it is vectorised.

    The substitutions are applied in order: newlines and carriage returns
    are removed, every ASCII digit is replaced by the literal word
    ``digit``, and single and double quotes are removed. The result is then
    stripped of surrounding whitespace and lower-cased.

    Args:
        string: The text to clean.

    Returns:
        The cleaned, lower-cased text.
    """
    # (pattern, replacement) pairs, applied sequentially.
    substitutions = (
        (r"\n", ""),
        (r"\r", ""),
        (r"[0-9]", "digit"),
        (r"\'", ""),
        (r"\"", ""),
    )

    cleaned = string
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip().lower()

In [None]:
# Read the raw CSV into a DataFrame; later cells use the column at position 1
# as the text and the `question_topic` column as the label.
df = pd.read_csv("../input/topics.csv")

In [None]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(5)

In [None]:
# Horizontal bar chart of the number of rows per `question_topic` value,
# useful for spotting class imbalance before training.
sns.countplot(y="question_topic", data=df)

In [None]:
# Build the feature list by cleaning every text in the column at position 1.
# Selecting the column once with `.iloc[:, 1]` avoids materialising a row
# Series on every iteration, and avoids `row[1]` -- an integer key on a
# label-indexed Series, which pandas has deprecated as positional access.
X = [clean_str(text) for text in df.iloc[:, 1]]

# Target labels, one per row, in the same order as X.
y = np.array(df['question_topic'])

In [None]:
# Hold out 30% of the samples for evaluation. A fixed random_state makes the
# split -- and every metric computed from it -- reproducible across kernel
# restarts instead of changing on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Text-classification pipeline: token counts -> TF-IDF weighting -> linear SVM.
model = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # random_state pins the solver's internal data shuffling so that refitting
    # on the same split yields the same classifier.
    ('clf', LinearSVC(C=2.0, random_state=42)),
])

In [None]:
# Fit every pipeline step (vectorizer, TF-IDF, classifier) on the training split.
model.fit(X_train, y_train)

In [None]:
pred = model.predict(X_test)

# One explicit, sorted label ordering shared by the matrix and the tick labels,
# so that each row/column can be read off by class name.
labels = np.unique(np.concatenate((y_test, pred)))

# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true classes and columns the predicted ones. Passing the arguments the other
# way round silently transposes the matrix.
c_matrix = confusion_matrix(y_test, pred, labels=labels)

sns.set(style="ticks", color_codes=True, rc={"figure.figsize": (12, 8)}, font_scale=1.2)
ax = sns.heatmap(
    c_matrix,
    annot=True,
    fmt="d",  # integer counts; the default format abbreviates large values
    annot_kws={"size": 12},
    xticklabels=labels,
    yticklabels=labels,
)
ax.set(xlabel="Predicted topic", ylabel="True topic")
ax

In [None]:
# Fraction of test-split samples whose predicted label equals the true label.
accuracy_score(y_test, pred)

In [None]:
question = "Give me an discount"
# Apply the same clean_str normalisation that was applied to the training
# texts, so the model sees input in the form it was fitted on (for example,
# digits rewritten as "digit" and quotes removed).
model.predict([clean_str(question)])[0]

In [None]:
# `sklearn.externals.joblib` was removed from scikit-learn; the standalone
# `joblib` package (a scikit-learn dependency) is the supported import. The
# fallback keeps the cell working on very old scikit-learn installations that
# only ship the vendored copy.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted pipeline (vectorizer, TF-IDF and classifier) to disk.
joblib.dump(model, "model.pkl")

In [None]:
# Reload the pipeline from "model.pkl" to confirm it survives a round trip to disk.
# NOTE: joblib.load unpickles arbitrary objects -- only load files you trust.
model_imported = joblib.load("model.pkl")

In [None]:
question = "Do you have this shirt in red?"
# Clean the input with clean_str, exactly as the training texts were cleaned,
# before handing it to the reloaded pipeline.
model_imported.predict([clean_str(question)])[0]