# Natural Language Processing - Classification of Text

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Load the airline tweets dataset and preview its first rows.
tweets_csv = "/kaggle/input/twitter-airline-sentiment/Tweets.csv"
df = pd.read_csv(tweets_csv)
df.head()

## EDA

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Number of tweets per sentiment class.
sns.countplot(x="airline_sentiment", data=df);

In [None]:
# Number of tweets per stated negative reason; the tick labels are slanted
# and right-aligned so the category names do not overlap.
plt.figure(dpi=150, figsize=(12, 4))
sns.countplot(data=df, x="negativereason")
plt.xticks(rotation=45, ha="right");

In [None]:
# Tweets per airline, with one bar per sentiment class.
plt.figure(dpi=150, figsize=(12, 4))
sns.countplot(x="airline", hue="airline_sentiment", data=df);

## Vectorization process

In [None]:
# Keep only the label column and the raw tweet text.
selected_columns = ["airline_sentiment", "text"]
data = df[selected_columns]
data.head()

In [None]:
# Features are the raw tweet texts; the target is the sentiment label.
X, y = df["text"], df["airline_sentiment"]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the tweets for validation; the fixed seed keeps the split
# reproducible between runs.
Xtrain, Xvalid, ytrain, yvalid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
# TF-IDF vectorizer configured with scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words="english")

In [None]:
# Fit the vectorizer on the training texts only; the validation texts are
# not passed in here.
tfidf.fit(Xtrain)

In [None]:
#print(tfidf.get_feature_names_out())

In [None]:
#print(tfidf.vocabulary_)

In [None]:
# Show the text of every tweet labelled as negative.
is_negative = data["airline_sentiment"] == "negative"
data.loc[is_negative, "text"]

In [None]:
# Count how often each (non stop-word) token occurs across the negative tweets.
count_vect = CountVectorizer(stop_words="english")
neg_matrix = count_vect.fit_transform(data[data["airline_sentiment"]=="negative"]["text"])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() returns the same vocabulary terms in column order.
# Summing over axis 0 gives one total count per vocabulary term.
freqs = zip(count_vect.get_feature_names_out(), neg_matrix.sum(axis=0).tolist()[0])
# Sort from largest to smallest and show the 100 most frequent terms
print(sorted(freqs, key=lambda pair: pair[1], reverse=True)[:100])

In [None]:
from wordcloud import WordCloud, STOPWORDS

stopwords = set(STOPWORDS)

# Build the cloud from the full text of every negative tweet.
# str(Series) must not be used here: it yields pandas' truncated display
# repr (shortened rows, index labels and the "Name/dtype" footer), so the
# cloud would be built from that repr instead of the actual tweets.
negative_text = " ".join(
    data.loc[data["airline_sentiment"] == "negative", "text"].dropna().astype(str)
)

wordcloud = WordCloud(
    background_color="white",
    stopwords=stopwords,
    max_words=200,
    max_font_size=40,
    random_state=42,
).generate(negative_text)

fig = plt.figure(figsize=(14, 6))
plt.imshow(wordcloud)
plt.axis("off")
plt.show();

In [None]:
# Build the cloud from the full text of every positive tweet.
# str(Series) must not be used here: it yields pandas' truncated display
# repr (shortened rows, index labels and the "Name/dtype" footer), so the
# cloud would be built from that repr instead of the actual tweets.
positive_text = " ".join(
    data.loc[data["airline_sentiment"] == "positive", "text"].dropna().astype(str)
)

wordcloud = WordCloud(
    background_color="white",
    stopwords=stopwords,
    max_words=200,
    max_font_size=40,
    random_state=42,
).generate(positive_text)

fig = plt.figure(figsize=(14, 6))
plt.imshow(wordcloud)
plt.axis("off")
plt.show();

In [None]:
# Project both splits into the TF-IDF space learned by the fitted vectorizer.
Xtrain_tfidf, Xvalid_tfidf = (
    tfidf.transform(texts) for texts in (Xtrain, Xvalid)
)

In [None]:
# (n_documents, n_vocabulary_terms). The sparse matrix exposes .shape
# directly, so there is no need to materialize a dense copy with .toarray()
# just to inspect its dimensions.
Xtrain_tfidf.shape

## Create a model

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes trained on the TF-IDF training features.
# fit() returns the estimator itself, so the binding and the cell output
# are the same fitted model.
nb = MultinomialNB().fit(Xtrain_tfidf, ytrain)
nb

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression trained on the TF-IDF training features, with a
# raised iteration cap of 10000.
# fit() returns the estimator itself, so the binding and the cell output
# are the same fitted model.
lr = LogisticRegression(max_iter=10000).fit(Xtrain_tfidf, ytrain)
lr

In [None]:
from sklearn.svm import SVC, LinearSVC

# Support vector classifier with default settings, trained on the TF-IDF
# training features.
# fit() returns the estimator itself, so the binding and the cell output
# are the same fitted model.
rbf_svc = SVC().fit(Xtrain_tfidf, ytrain)
rbf_svc

In [None]:
# Linear support vector classifier with default settings, trained on the
# TF-IDF training features.
# fit() returns the estimator itself, so the binding and the cell output
# are the same fitted model.
linear_svc = LinearSVC().fit(Xtrain_tfidf, ytrain)
linear_svc

### Evaluating models

In [None]:
from sklearn.metrics import plot_confusion_matrix, classification_report

In [None]:
def report(model):
    """Print a classification report and plot a confusion matrix for *model*.

    The model is evaluated on the notebook-level validation split
    (``Xvalid_tfidf`` / ``yvalid``).

    Args:
        model: A fitted classifier exposing ``predict`` that accepts the
            TF-IDF validation matrix.
    """
    # Imported locally so this function does not depend on the
    # ``plot_confusion_matrix`` import, which was deprecated in
    # scikit-learn 1.0 and removed in 1.2.
    from sklearn.metrics import ConfusionMatrixDisplay, classification_report

    preds = model.predict(Xvalid_tfidf)
    print(classification_report(yvalid, preds))
    # Reuse the predictions computed above rather than predicting a second time.
    ConfusionMatrixDisplay.from_predictions(yvalid, preds)

In [None]:
# Evaluate the Multinomial Naive Bayes model on the validation split.
report(nb)

In [None]:
# Evaluate the logistic regression model on the validation split.
report(lr)

In [None]:
# Evaluate the default SVC model on the validation split.
report(rbf_svc)

In [None]:
# Evaluate the LinearSVC model on the validation split.
report(linear_svc)

## Pipeline

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# End-to-end pipeline: raw text -> TF-IDF features -> SVC.
# It is fitted on the complete dataset (X, y), not only on the training split.
pipeline_steps = [
    ("tfidf", TfidfVectorizer()),
    ("rbf_svc", SVC()),
]
pipe = Pipeline(steps=pipeline_steps)

pipe.fit(X, y)

In [None]:
# Predict the sentiment of a new raw sentence; the pipeline vectorizes it itself.
pipe.predict(["The flight went really quick"])

In [None]:
# Predict the sentiment of a sentence containing a negation ("wasn't ... good").
pipe.predict(["This flight wasn't very good experience"])

In [None]:
# Predict the sentiment of a very short sentence.
pipe.predict(["ok flight"])