# STATUS: FINAL

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from PIL import Image

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

In [None]:
# Read the train / valid / test CSV files from the competition's input folder.
DATA_DIR = '/kaggle/input/question-classification-android-or-ios'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
valid = pd.read_csv(f'{DATA_DIR}/valid.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Keep the training frame's column index in `columns` and print it.
columns = train.columns
print(columns)

# The target column

The target column is **Label**. As the title suggests, there are two target labels to classify each question into: android or ios.

There are far more questions about android than about ios: more than double.

In [None]:
# Number of training rows per value of the Label column.
train['Label'].value_counts()

In [None]:
# Bar chart of the number of training rows per Label value.
fig, ax = plt.subplots()
sns.countplot(x='Label', data=train, ax=ax)
plt.show()

In [None]:
# Draw the same number of rows from each class so the two samples are equal in size.
# NOTE(review): 14217 is presumably the size of the smaller (ios) class -
# confirm against train['Label'].value_counts().
# random_state=0 makes both draws reproducible.
is_android = train['Label'] == 'android'
is_ios = train['Label'] == 'ios'

android_sample = train.loc[is_android].sample(n=14217, random_state=0)
ios_sample = train.loc[is_ios].sample(n=14217, random_state=0)

In [None]:
# Stack the two equally sized class samples into one frame with a fresh
# 0..n-1 index. pd.concat is used because DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0.
balanced_dataset = pd.concat([android_sample, ios_sample], ignore_index=True)
balanced_dataset

# Some wordclouds

In [None]:
# Load the three images used as word-cloud masks and convert each to a NumPy array.
ios, android, black_background = (
    np.array(Image.open(mask_path))
    for mask_path in (
        '../input/logoss/1878_apple-logo.png',
        '../input/logoss/Android-logo.jpg',
        '../input/white-background/black.png',
    )
)

In [None]:
# Word cloud built from every training Title, lower-cased and joined into one
# string; STOPWORDS are excluded and black_background is used as the mask.
lowercase_titles = train['Title'].str.lower()
word_string = " ".join(lowercase_titles)
cloud_builder = WordCloud(mask=black_background, stopwords=STOPWORDS)
wordcloud = cloud_builder.generate(word_string)

fig, ax = plt.subplots(figsize=(15, 15))
ax.imshow(wordcloud)
ax.axis('off')
plt.show()

In [None]:
# Word cloud built from every training Body, lower-cased and joined into one
# string; STOPWORDS are excluded and black_background is used as the mask.
lowercase_bodies = train['Body'].str.lower()
word_string = " ".join(lowercase_bodies)
cloud_builder = WordCloud(mask=black_background, stopwords=STOPWORDS)
wordcloud = cloud_builder.generate(word_string)

fig, ax = plt.subplots(figsize=(15, 15))
ax.imshow(wordcloud)
ax.axis('off')
plt.show()

In [None]:
# Word cloud of the Body text of rows labelled 'android', drawn inside the
# android mask image; STOPWORDS are excluded.
android_bodies = train.loc[train['Label'] == 'android', 'Body']
word_string = " ".join(android_bodies.str.lower())
cloud_builder = WordCloud(mask=android, stopwords=STOPWORDS)
wordcloud = cloud_builder.generate(word_string)

fig, ax = plt.subplots(figsize=(15, 15))
ax.imshow(wordcloud)
ax.axis('off')
plt.show()

In [None]:
# Word cloud of the Body text of rows labelled 'ios', drawn inside the ios
# mask image; STOPWORDS are excluded.
ios_bodies = train.loc[train['Label'] == 'ios', 'Body']
word_string = " ".join(ios_bodies.str.lower())
cloud_builder = WordCloud(mask=ios, stopwords=STOPWORDS)
wordcloud = cloud_builder.generate(word_string)

fig, ax = plt.subplots(figsize=(15, 15))
ax.imshow(wordcloud)
ax.axis('off')
plt.show()

# The lazy programmer's model

In [None]:
# Features are the Body text only (Title is not used); the target is Label.
X_train, y_train = train['Body'], train['Label']
X_test, y_test = test['Body'], test['Label']

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn import metrics

# TF-IDF features (English stop words removed) feeding a linear SVM.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LinearSVC(random_state=0)),
])

text_clf.fit(X_train, y_train)

predictions = text_clf.predict(X_test)

# Pin the row/column order of the matrix to the classifier's class order so
# the tick labels passed to the heatmap are guaranteed to line up with it.
class_names = text_clf.classes_
cm = metrics.confusion_matrix(y_test, predictions, labels=class_names)

plt.figure(figsize=(9,9))
# The cells are integer counts, so annotate them as integers (fmt="d") and
# label both axes with the class names instead of positional 0/1 ticks.
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues_r',
            xticklabels=class_names, yticklabels=class_names);
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Confusion Matrix - score:'+str(metrics.accuracy_score(y_test,predictions))
plt.title(all_sample_title, size = 15);
plt.show()

# Per-class precision / recall / F1 on the test set.
print('')
print(metrics.classification_report(y_test ,predictions))

# Using balanced dataset

Training on the balanced dataset actually gave a lower overall accuracy, but it produced more correct predictions for ios, as expected.

In [None]:
# Train on the class-balanced sample; features are the Body text, target is Label.
X_train = balanced_dataset['Body']
y_train = balanced_dataset['Label']

# Evaluate on Body as well: the vectorizer is fitted on Body text, so scoring
# on Title would measure a train/test feature mismatch rather than the effect
# of balancing the classes.
X_test = test['Body']
y_test = test['Label']

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn import metrics

# TF-IDF features (English stop words removed) feeding a linear SVM,
# re-fitted on whatever X_train / y_train currently hold.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LinearSVC(random_state=0)),
])

text_clf.fit(X_train, y_train)

predictions = text_clf.predict(X_test)

# Pin the row/column order of the matrix to the classifier's class order so
# the tick labels passed to the heatmap are guaranteed to line up with it.
class_names = text_clf.classes_
cm = metrics.confusion_matrix(y_test, predictions, labels=class_names)

plt.figure(figsize=(9,9))
# The cells are integer counts, so annotate them as integers (fmt="d") and
# label both axes with the class names instead of positional 0/1 ticks.
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues_r',
            xticklabels=class_names, yticklabels=class_names);
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Confusion Matrix - score:'+str(metrics.accuracy_score(y_test,predictions))
plt.title(all_sample_title, size = 15);
plt.show()

# Per-class precision / recall / F1 on the test set.
print('')
print(metrics.classification_report(y_test ,predictions))