In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns # used to plot interactive graph.
from sklearn.metrics import f1_score, confusion_matrix  # evaluate models

# tokenize
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
import re, string

# NLP
import nltk


In [None]:
# Load the three dataset splits from the Kaggle input directory.
# The test split is held out until the final evaluation section.
df_test = pd.read_csv('/kaggle/input/question-classification-android-or-ios/test.csv')
df_train = pd.read_csv('/kaggle/input/question-classification-android-or-ios/train.csv')
df_valid = pd.read_csv('/kaggle/input/question-classification-android-or-ios/valid.csv')

In [None]:
# Size of the training split, followed by a preview of its first rows.
print(df_train.shape)
df_train.head()

## ***Very* Basic Data Viz**

In [None]:
# Size of the validation split, followed by a preview of its first rows.
print(df_valid.shape)
df_valid.head()

In [None]:
# Size of the test split, followed by a preview of its first ten rows.
print(df_test.shape)
df_test.head(10)

In [None]:
# Per-class mean of the numeric columns. `numeric_only=True` is required because the
# frame also holds text columns (e.g. Title); without it pandas >= 2.0 raises a
# TypeError instead of silently dropping them.
df_train.groupby(['LabelNum']).mean(numeric_only=True)

In [None]:
# Summary statistics for the training split.
df_train.describe()

# **Using NLTK to perform NLP** (on just the titles)

### **Tokenize**

In [None]:
# Fetch the NLTK resources used by the tokenizing / cleaning steps below.
# Newer NLTK releases look up the tokenizer and POS-tagger models under the
# `punkt_tab` and `averaged_perceptron_tagger_eng` names, so both the legacy and
# the current resource names are requested to keep the notebook runnable on either.
for resource in (
    'punkt',                            # this is a tokenizer (legacy resource name)
    'punkt_tab',                        # tokenizer tables used by newer NLTK releases
    'wordnet',                          # lexical database (determine base word)
    'averaged_perceptron_tagger',       # context of a word (legacy resource name)
    'averaged_perceptron_tagger_eng',   # POS tagger name used by newer NLTK releases
    'stopwords',                        # stopwords
):
    nltk.download(resource)

In [None]:


def remove_noise(tweet_tokens, stop_words = ()):
    """Clean, lemmatize and lower-case a list of word tokens.

    Each token is POS-tagged, stripped of URLs, @-mentions and HTML-like tags,
    lemmatized with WordNet, and then kept only if it is non-empty, is not
    found in ``string.punctuation`` and is not a stop word.

    Parameters
    ----------
    tweet_tokens : list of str
        Tokens of a single text, e.g. the output of ``nltk.word_tokenize``.
    stop_words : iterable of str, optional
        Lower-case words to drop. Defaults to an empty tuple (nothing dropped).

    Returns
    -------
    list of str
        The surviving tokens, lower-cased, in their original order.
    """
    # Raw strings keep the regex text identical while avoiding Python's
    # invalid-escape-sequence warnings for `\(`, `\/` and `\w`.
    url_pattern = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                   r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_pattern = r"(@[A-Za-z0-9_]+)"   # remove tagging of users
    html_pattern = r"(<\/?\w*>)"            # remove html

    # Loop-invariant work hoisted out of the per-token loop.
    stop_word_set = frozenset(stop_words)   # O(1) membership instead of a list scan
    lemmatizer = WordNetLemmatizer()

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub(url_pattern, '', token)
        token = re.sub(mention_pattern, "", token)
        token = re.sub(html_pattern, "", token)

        # Map the tagger's tag onto the WordNet POS codes the lemmatizer expects:
        # nouns -> 'n', verbs -> 'v', everything else is treated as an adjective.
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)

        # NOTE: `in string.punctuation` is a substring test, so only tokens that
        # appear verbatim inside that string (e.g. "," or "?") are dropped.
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_word_set:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [None]:
# stopwords to be parsed into function `remove_noise` defined above 
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# Tokenize the Title column directly; a row-wise DataFrame.apply(axis=1) builds a
# Series object per row only to read a single field from it.
all_tokens = df_train['Title'].apply(nltk.word_tokenize)


In [None]:
# Run every tokenized title through `remove_noise`, dropping English stop words,
# and store the cleaned token lists alongside the original training rows.
cleaned_tokens = [remove_noise(title_tokens, stop_words) for title_tokens in all_tokens]

df_train['cleaned_tokenized_titles'] = cleaned_tokens

# Let's look at our tokens

In [None]:
# Cleaned tokens of the Android-labelled titles (LabelNum == 0). Printed explicitly:
# a bare expression is only rendered when it is the last statement of its cell.
print(df_train.cleaned_tokenized_titles[df_train.LabelNum == 0])

def get_all_words(cleaned_tokens_list):
    """Lazily yield every token from an iterable of token lists, in order."""
    for title_tokens in cleaned_tokens_list:
        yield from title_tokens
            
from nltk import FreqDist

# Token-frequency distributions per class: LabelNum 1 -> Apple, LabelNum 0 -> Android.
apple_title_tokens = df_train.cleaned_tokenized_titles[df_train.LabelNum == 1].values
android_title_tokens = df_train.cleaned_tokenized_titles[df_train.LabelNum == 0].values

freq_dist_apple = FreqDist(get_all_words(apple_title_tokens))
freq_dist_android = FreqDist(get_all_words(android_title_tokens))

In [None]:
# Ten most frequent cleaned tokens in the Apple-labelled (LabelNum == 1) titles.
freq_dist_apple.most_common(10)

In [None]:
# Ten most frequent cleaned tokens in the Android-labelled (LabelNum == 0) titles.
freq_dist_android.most_common(10)

# Naive Bayes Classification Model

In [None]:
from sklearn.model_selection import train_test_split
import random

def prep_tokens_for_model(cleaned_tokens_list):
    """Lazily turn each token list into a ``{token: True}`` feature dict.

    One dict is yielded per input token list; repeated tokens collapse into a
    single key.
    """
    for title_tokens in cleaned_tokens_list:
        yield dict.fromkeys(title_tokens, True)


# NLTK requires the data in this format:
# Pair every feature dict with its class name (LabelNum 0 -> 'Android', 1 -> 'Apple').
android_mask = df_train.LabelNum == 0
apple_mask = df_train.LabelNum == 1

android_data = [
    (features, 'Android')
    for features in prep_tokens_for_model(df_train.cleaned_tokenized_titles[android_mask])
]
apple_data = [
    (features, 'Apple')
    for features in prep_tokens_for_model(df_train.cleaned_tokenized_titles[apple_mask])
]

# Android pairs first, then Apple pairs, shuffled in place so the classes are interleaved.
X_train = [*android_data, *apple_data]
random.shuffle(X_train)


In [None]:
from nltk import classify
from nltk import NaiveBayesClassifier

# Fit NLTK's Naive Bayes classifier on the (features, label) training pairs.
clf = NaiveBayesClassifier.train(X_train)

# show_most_informative_features prints its table itself and returns None, so it
# must not be wrapped in print() -- that only appends a stray "None" to the output.
clf.show_most_informative_features(10)

In [None]:
# Split the labelled pairs back into feature dicts and their correct labels.
# NOTE: these pairs come from X_train, so the scores computed from them describe
# how well the model fits its own training data.
testing = [features for features, _ in X_train]
y_test = [label for _, label in X_train]

# making predictions
y_preds = [clf.classify(features) for features in testing]

In [None]:
# Sanity check: predictions and true labels should have the same length,
# and the first couple of entries of each should be class-name strings.
print("y_preds length", len(y_preds))
print(y_preds[:2])
print("y_test length", len(y_test))
print(y_test[:2])

In [None]:

# evaluating by f1_score
# The F1 value is printed explicitly: only the last expression of a cell is
# rendered, so a bare f1_score(...) call above the confusion matrix is discarded.
print("F1 score is: ", f1_score(y_test, y_preds, labels=['Android', 'Apple'], pos_label='Apple'))

# Row-normalized confusion matrix (rows = true class, columns = predicted class).
confusion_matrix(y_test, y_preds, labels=['Android', 'Apple'], normalize='true')

In [None]:

# Accuracy of the classifier on X_train, i.e. the same pairs it was trained on.
print("Accuracy is: ", classify.accuracy(clf, X_train))


# **Evaluating Model on `test.csv`:**

In [None]:
def prep_for_model(df):
    """Convert a labelled DataFrame into shuffled (features, label) pairs for NLTK.

    Each title is tokenized, cleaned with ``remove_noise`` (using the module-level
    ``stop_words``) and converted into a ``{token: True}`` feature dict.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``Title`` column and a ``LabelNum`` column in which
        0 marks Android rows and 1 marks Apple rows.

    Returns
    -------
    list of (dict, str)
        One ``(features, "Android" | "Apple")`` pair per matching row, in random order.
    """
    labelled_pairs = []

    # Android pairs are added before Apple pairs; the shuffle below mixes them.
    for label_num, label_name in ((0, "Android"), (1, "Apple")):
        # Read from the `df` argument (not the global df_test) so the helper
        # works for whichever split the caller passes in.
        titles = df.Title[df.LabelNum == label_num]

        cleaned_tokens = [
            remove_noise(nltk.word_tokenize(title), stop_words)
            for title in titles.values
        ]

        labelled_pairs.extend(
            (features, label_name)
            for features in prep_tokens_for_model(cleaned_tokens)
        )

    random.shuffle(labelled_pairs)

    return labelled_pairs

In [None]:
# Build shuffled (features, label) pairs from the held-out test split.
X = prep_for_model(df_test)

# Separate the feature dicts from the correct labels used for scoring.
X_test = [features for features, _ in X]
y_test = [label for _, label in X]

# making predictions
y_preds = [clf.classify(features) for features in X_test]

test_f1 = f1_score(y_test, y_preds, labels=['Android', 'Apple'], pos_label='Apple')
test_confusion = confusion_matrix(y_test, y_preds, labels=['Android', 'Apple'], normalize='true')
print("F1 score is: ", test_f1)
print(test_confusion)

test_accuracy = classify.accuracy(clf, X)
print("Accuracy is: ", test_accuracy)