![ML_models.jpg](https://lh3.googleusercontent.com/7AqWfylPqHIoe0LrUN-N5YY_OnEJhOjdb_-1EwEjrUvA7wSxxMigtpfiKwVnXD30hamAvxy4angTczmHStoGSzQqTsTxYjRBrlQrTbtT-ZczOwXJQy6ff6GQP3a5V7nsKn6pJV-tmlsdceFhlY93ULyoj_zv7SMBZRoTpQoxTpgkEADkvmM1hQ2kM5FrrfdueusgR4HS8OzbACPEs_1vioNhtdC__E0gWe8HAcSo_O4Gwhsv8_qEa8mka9Y0d_KktQiXpPF91SBo36jzDBGtRQAeQqhANGvbfxTfnGiNGUD8rsmeSFwfMQJ9HKaR3xM1VCEd0VF2GT7qB--1yfCRjwnPvjc1lfDXaiPH6_Aywm8luWdd7iplXQ2-ib6sbVKJx5v_hIBqEGVAkzQktEk4uOyWjkYrUUvaCX_ZTCgMVbdDVQ_97mQ3j29PGa0iAMnSlRE0Zg8wK3NzCZOLUOlse9m-VaKY40WzWWyH0g2aTmhtbGOauMrG-RC5sGcQ7unWfRMDsmV9k8Y8a1sSa-VwYcqyp_BPdogGdA1q0WQ4vAUYTC-li7TmXId7-10X5k3_XHqxL8BUoMqF1njoiVNDwv9fWuSWgilsnWR5S_SiI5LDmPnwAgQnXiDrKa2UahMFL_c3mDrksdstInzBSfa6jGWvSj8CdnxOD0V-oeuYdwED4b4P2cgiUcq6isfzr2M2ki30qqLgjr0RjZfOzjdOkbBT-g=w1251-h415-no?authuser=0)

<h1 align='center'>Stackoverflow: 20 Tags Classifier</h1>

# Index
- [Objective.](#Objective)

- [Data to work.](#DATA_TO_WORK)

- [Data Cleaning.](#DATA_CLEANING)

- [EDA](#EDA)

- [Text Preprocessing.](#TEXT_PREPROCESSING)

- [Splitting Train Data and Test Data.](#SPLITTING_TRAIN_DATA_AND_TEST_DATA)

- [Naive-Bayes Classifier for Multinomial Models.](#Naive_Bayes_Classifier_for_Multinomial_Models)

- [Linear Support Vector Machine.](#Linear_support_vector_machine)

- [Logistic Regression.](#Logistic_regression)

- [Results.](#Results)


## Objective <a class="anchor" id="Objective"></a>
The objective of this work is to compare several text classification models trained with Scikit-Learn and to choose the most accurate one for our classification problem.

In [None]:
# Libraries
import numpy as np 
import pandas as pd 
    # To plot
import matplotlib.pyplot as plt
    # To preprocess data
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
    # To machine learning
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
    # To import data
import os

# other downloads
# nltk.download('stopwords')
# nltk.download('punkt')


In [None]:
# Kaggle
# Input data files are available in the read-only "../input/" directory.
# Print the full path of every file found under the input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Loading the training data (pandas reads the zipped CSV directly).
df = pd.read_csv("/kaggle/input/facebook-recruiting-iii-keyword-extraction/Train.zip")


# Colab
# Upload the "kaggle.json" file holding the "Kaggle API token" manually,
# then run the commands below instead of the Kaggle section above.
# ! pip install kaggle
# ! mkdir ~/.kaggle
# ! cp kaggle.json ~/.kaggle/
# ! chmod 600 ~/.kaggle/kaggle.json
# ! kaggle competitions download facebook-recruiting-iii-keyword-extraction -f Train.zip
# ! unzip Train.zip
# df = pd.read_csv("./Train.csv")

## <center style="background-color:Gainsboro; width:80%;">DATA TO WORK</center>
<a class="anchor" id="DATA_TO_WORK"></a>

In [None]:

# List of tags to study.
list_tags = ['java','html','asp.net','c#','ruby-on-rails','jquery','mysql','php','ios','javascript','python','c','css','android','iphone','sql','objective-c','c++','angularjs','.net']

# Working frame: only the rows whose 'Tags' value is exactly one of the tags above.
# A single .loc selection followed by .copy() gives an independent frame, so the
# column assignment below does not write into a slice of `df`
# (avoids pandas' SettingWithCopyWarning).
df_tags = df.loc[df['Tags'].isin(list_tags), ['Title', 'Body', 'Tags']].copy()
# Joining columns title and body into a single text column.
df_tags['post'] = df_tags['Title'] + ' ' + df_tags['Body']
# Removing the columns that are now redundant.
df_tags = df_tags.drop(columns=['Title', 'Body'])

df_tags.head(2)

In [None]:
# Row count of the filtered frame; kept so the de-duplication cell can report against it.
len_df_tags = len(df_tags)
print(f'Original Df length: \t{len(df)}')
print(f'Df_Tags length: \t{len_df_tags}')


In [None]:
# Drop the reference to the full dataset to free RAM; later cells use df_tags only.
del df

<a class="anchor" id="DATA_CLEANING"></a>
## <center style="background-color:Gainsboro; width:80%;">DATA CLEANING</center>

In [None]:
# Number of missing values in each column.
df_tags.isnull().sum()

In [None]:
# Remove fully duplicated rows, then report how many rows existed before,
# how many were dropped (absolute and as a percentage) and how many remain.
# The printed messages are in Spanish.
df_tags = df_tags.drop_duplicates()
rows_remaining = len(df_tags)
len_duplicated = len_df_tags - rows_remaining
pct_duplicated = round((len_duplicated*100)/len_df_tags, 2)

print(f'¿Cuántas celdas había?: {len_df_tags}')
print(f'¿Cuántas celdas se restaron? (por estar duplicadas): {len_duplicated}\t(Esto equivale al {pct_duplicated}%)')
print(f'¿Cuántas celtas quedaron?: {rows_remaining}')


<a class="anchor" id="EDA"></a>
## <center style="background-color:Gainsboro; width:80%;">EDA</center>

In [None]:
# Bar chart with the number of posts per tag.
fig, ax = plt.subplots(figsize=(10, 4))
df_tags['Tags'].value_counts().plot(kind='bar', ax=ax)

In [None]:
%%time

from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
from nltk.tokenize import word_tokenize
from PIL import Image
# import requests
# from io import BytesIO 

for i,item in enumerate(list_tags):
  
  if i%2==0:
    tag1 = item
    tag2 = list_tags[i-1]

    # Title:
    title = '\t\t\t\t\t' + tag1.capitalize() + '\t\t\t\t\t\t\t' + tag2.capitalize()
    print(title)

    # Plot side by side
    fig, ax = plt.subplots(1,2, figsize=[20,10])

    # Wordcloud left
    words = " ".join(df_tags[df_tags['Tags']==tag1]['post'])
    def punctuation_stop(text):
        """remove punctuation and stop words"""
        filtered = []
        stop_words = set(stopwords.words('english'))
        word_tokens = word_tokenize(text)
        for w in word_tokens:
            if w not in stop_words and w.isalpha():
                filtered.append(w.lower())
        return filtered
    words_filtered = punctuation_stop(words)
    text = " ".join([ele for ele in words_filtered])
    # wordcloud contour shape
    set_stopwords = set(STOPWORDS)
    mask = np.array(Image.open("../input/cloud-image-for-wordcloud/image.png"))
#     url = 'https://lh3.googleusercontent.com/usScoX99qC6K_ihnZyINhhmCs_L2H0o7gycoBrVronBooPaiDmtAPDLFvylLc5K37iTcTEqXKK044Y6OhZ_q3LeNSqrb9gHXt8xQh6Tf7GYr9S36M8Lzawfpc4Op1S0HXKhc7g_HLMGYGzmjsA72qU64fPqeRTrWCh9nySOnkkHftQyoKBF8-PvqEcAGhIpuOOr-rlNXLrciOpOnPWpg2y3TsAiAqgdqA76HI2WlCpClIZVcTWRUe9Yz43NrAbvdWtJcafXQQBcDqXe9pNeKbt9jAtQQ0Gvus1HfJj0zvAJgSdJ9ZKwi2RTx6EAYspNsUgNDJ4NIwHrXyjtCzrvp64MxwbP2UgdpG14ieqdy5_c8YbqIFKR3lgIyPM3bt3gOIdaCOAsm3Lqyf-2v8cMeY6UsLuwzRBthbI_-m5yM--YqgtQl8oj8kD-FPn2kWcyu8zliV6J5nQ1_he5Wws06nVWE_eZ7_CkvTFjzj6k_-OjCrtI6D1OaLQzM_Cgt_eLCqTMqMz1q7PTW5AP8tCCTemye2luU5v5zkaMPgsybkFE0XEewGL0X26ikbzGeOTjnSQoMKz5MrYEAfx8kaTii-wCkOP_f0h8C-PdsvtBgJ3qB9xx23Oq-cwCdrlWudyLaPBwAAPhVfCgPNk790SlmYu8hFoMBViuu2DCRWcQ0_pEg4vStLoYV2nWofvJ2Ozi3qx8wJvKeZdCcHpSh_nr8eT-nsw=w800-h600-no?authuser=0'
#     response = requests.get(url)
#     img = Image.open(BytesIO(response.content))
# #     img = Image.open(requests.get(url, stream=True).raw)
#     mask = np.array(img)
    
    
    wc= WordCloud(stopwords=set_stopwords,
                  background_color="white", 
                  colormap='cool',
                  max_words = 1500, 
                  width =800, height = 1500,
                  mask=mask, 
                  contour_width=3,
                  contour_color='#023075',
                  random_state=1)
    wc.generate(text)
    ax[0].axis('off')
    ax[0].imshow(wc,interpolation="bilinear")

    # Wordcloud right
    words2 = " ".join(df_tags[df_tags['Tags']==tag2]['post'])
    words_filtered2 = punctuation_stop(words2)
    text2 = " ".join([ele for ele in words_filtered2])
    wc2= WordCloud(stopwords=set_stopwords,
                  background_color="white", 
                  colormap='cool',
                  max_words = 1500, 
                  width =800, height = 1500,
                  mask=mask, 
                  contour_width=3,
                  contour_color='#023075',
                  random_state=1)
    wc2.generate(text2)
    ax[1].axis('off')
    ax[1].imshow(wc2,interpolation="bilinear")

    plt.show()

<a class="anchor" id="TEXT_PREPROCESSING"></a>
## <center style="background-color:Gainsboro; width:80%;">TEXT PREPROCESSING</center>

In [None]:
%%time
en_stopwords = stopwords.words('english')

# HTML decoding.
df_tags['post'] = df_tags['post'].apply(lambda x: (BeautifulSoup(x, 'lxml').text).lower())
# Replacingsymbols by space in text.
df_tags['post'] = df_tags['post'].str.replace('[/(){}\[\]\|@,;]', ' ', regex=True)
# df_tags['post'] = df_tags['post'].apply(lambda x: re.sub('[/(){}\[\]\|@,;]','  ',x.lower()))
# Deletting symbols from text.
df_tags['post'] = df_tags['post'].str.replace('[^0-9a-z #+_]', '', regex=True)
# df_tags['post'] = df_tags['post'].apply(lambda x: re.sub('[^0-9a-z #+_]',' ',x.lower()))
# Deleting stop words.
df_tags['post'] = df_tags['post'].apply(lambda x: ' '.join([word for word in x.split() if word not in (en_stopwords)]))

df_tags.head(5)

In [None]:
# Show the second preprocessed post (position 1) as a sanity check.
df_tags['post'].iat[1]

<a class="anchor" id="SPLITTING_TRAIN_DATA_AND_TEST_DATA"></a>
## <center style="background-color:Gainsboro; width:80%;">SPLITTING TRAIN DATA AND TEST DATA</center>
70% of the data is used for training and the remaining 30% for testing.

In [None]:
# Features are the preprocessed post text, targets are the tag labels.
X, y = df_tags['post'], df_tags['Tags']
# Hold out 30% of the rows for testing; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

<a class="anchor" id="Naive_Bayes_Classifier_for_Multinomial_Models"></a>
# <center style="background-color:Gainsboro; width:80%;">Naive Bayes Classifier for Multinomial Models</center>

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Token counts -> TF-IDF weighting -> multinomial Naive Bayes classifier.
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
nb = Pipeline(nb_steps)
nb.fit(X_train, y_train)

In [None]:
# %%time
# Predict on the held-out posts and score the Naive Bayes pipeline.
y_pred = nb.predict(X_test)

accuracy_Naive_Bayes = accuracy_score(y_test, y_pred)
print('accuracy %s' % accuracy_Naive_Bayes)
# No target_names: the labels are the tag strings themselves, so the report
# names each row with its own label. Passing list_tags (which is not in sorted
# label order) would attach the wrong tag name to the rows.
print(classification_report(y_test, y_pred))

<a class="anchor" id="Linear_support_vector_machine"></a>
# <center style="background-color:Gainsboro; width:80%;">Linear support vector machine</center>

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent (hinge loss, L2 penalty),
# run for a fixed 5 iterations (tol=None) with a fixed seed.
svm_clf = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
# Token counts -> TF-IDF weighting -> classifier.
sgd = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', svm_clf),
])
sgd.fit(X_train, y_train)

In [None]:
%%time

y_pred = sgd.predict(X_test)

accuracy_Linear_SVM = accuracy_score(y_pred, y_test)
print('accuracy %s' % accuracy_Linear_SVM)
print(classification_report(y_test, y_pred,target_names=list_tags))

<a class="anchor" id="Logistic_regression"></a>
# <center style="background-color:Gainsboro; width:80%;">Logistic regression</center>

In [None]:
from sklearn.linear_model import LogisticRegression

# Token counts -> TF-IDF weighting -> logistic regression (liblinear solver, C=1e5).
logreg_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(n_jobs=1, C=1e5, solver='liblinear')),
]
logreg = Pipeline(logreg_steps)
logreg.fit(X_train, y_train)

In [None]:
%%time

y_pred = logreg.predict(X_test)

accuracy_Logistic_Reg = accuracy_score(y_pred, y_test)
print('accuracy %s' % accuracy_Logistic_Reg)
print(classification_report(y_test, y_pred,target_names=list_tags))

<a class="anchor" id="Results"></a>
# <center style="background-color:Gainsboro; width:80%;">Results</center>

In [None]:
# One row per model, a single 'Accuracy' column, best model first.
accuracy_by_model = pd.Series(
    {
        'Naive Bayes Classifier for Multinomial Models': accuracy_Naive_Bayes,
        'Linear support vector machine': accuracy_Linear_SVM,
        'Logistic regression': accuracy_Logistic_Reg,
    },
    name='Accuracy',
)
df_results = accuracy_by_model.to_frame().sort_values(by='Accuracy', ascending=False)

display(df_results)



In [None]:
# df_results is sorted by accuracy in descending order, so the first index label is the top model.
print(f'The better results are given by: {df_results.index[0]}')