# Text Classification
> Text classification is one of the most important tasks in Natural Language Processing. It is the process of classifying text strings or documents into different categories, depending upon the contents of the strings. Text classification has a variety of applications, such as detecting user sentiment from a tweet, classifying an email as spam or ham, classifying blog posts into different categories, automatic tagging of customer queries, and so on.

In [15]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Text,Intent
0,How hot is it today?,temperature
1,Is it hot outside?,temperature
2,Will it be uncomfortably hot?,temperature
3,Will it be sweltering?,temperature
4,How cold is it today?,temperature


# Feature Extraction

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Inputs are the raw utterances; targets are their intent labels.
# NOTE(review): df_data is not defined in any visible cell -- presumably it is
# created by the cell whose code was removed for sharing; confirm.
x, y = df_data['Text'], df_data['Intent']

# Keep 20% of the rows aside for testing; the fixed random_state makes the
# split repeatable across runs.
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)


In [19]:
# Shape of the training labels, i.e. how many training examples there are
yTrain.shape

(40,)

In [32]:
# Sample labels: the first five intents in the held-out test split
yTest.head()

28     conditions
11    temperature
10    temperature
41     conditions
2     temperature
Name: Intent, dtype: object

In [21]:
# Learn the TF-IDF vocabulary and weights from the training text only, and
# encode that same text as a document-term matrix.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(xTrain)
# (number of training documents, number of vocabulary terms)
vectors.shape

(40, 57)

In [22]:
# Machine learning: support-vector classifier on the TF-IDF training matrix
from sklearn.svm import SVC

# gamma is the kernel coefficient for the 'rbf', 'poly' and 'sigmoid' kernels.
# gamma='auto' (1 / n_features per the scikit-learn docs) gives poor accuracy
# on this data; gamma=1 was noted to give better accuracy:
#     clf_svm = SVC(kernel='rbf', gamma=1)
clf_svm = SVC(kernel='rbf', gamma='auto').fit(vectors, yTrain)
print("Train Done")
print("Train accracy = ", clf_svm.score(vectors, yTrain))

Train Done
Train accracy =  0.55


In [23]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with a small smoothing term (alpha), fitted on the
# TF-IDF training matrix.
clf_naive = MultinomialNB(alpha=0.01).fit(vectors, yTrain)
print("\nTrain Done")
print("Train accracy = ", clf_naive.score(vectors, yTrain))


Train Done
Train accracy =  1.0


In [24]:
# NOTE(review): `metrics` is not used in any visible cell -- confirm whether it is still needed.
from sklearn import metrics
# Encode the held-out text with the vectorizer fitted on the training split
# (transform only, so the vocabulary is not re-learned from test data).
vectors_test = vectorizer.transform(xTest)

# Mean accuracy of each classifier on the held-out split
print("SVM Test accracy = ",clf_svm.score(vectors_test, yTest))
print("Naive bayes Test accracy = ",clf_naive.score(vectors_test, yTest))

SVM Test accracy =  0.2
Naive bayes Test accracy =  0.7


# Let's test our models by feeding them a query

In [25]:
# Encode a single query and show the SVM's prediction for it.
# Note: this rebinds vectors_test, which previously held the encoded test split.
vectors_test = vectorizer.transform(["How hot is it today?"])
clf_svm.predict(vectors_test)[0]

'temperature'

In [26]:
# Same query as the SVM check, this time classified by the Naive Bayes model
vectors_test = vectorizer.transform(["How hot is it today?"])
clf_naive.predict(vectors_test)[0]

'temperature'

# Need for better word embeddings
* The model cannot handle unseen words such as "gloomy", whereas "sunny" is understood.
* Reason: the word "gloomy" is not present in the vectorizer's vocabulary, so it is ignored when the query is encoded.

In [27]:
# Query containing "sunny", classified by the Naive Bayes model
vectors_test = vectorizer.transform(["Is it sunny now?"])
clf_naive.predict(vectors_test)[0]

'conditions'

In [30]:
# Two queries in one batch: one containing "gloomy" and one about heat
vectors_test = vectorizer.transform(["Is it gloomy now?", "how hot it is?"])
# incorrect prediction: the "gloomy" query is labelled 'temperature', the same as the heat query
clf_naive.predict(vectors_test)

array(['temperature', 'temperature'], dtype='<U11')