In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

<span style="color:red">some **This is Red Bold.** text</span>

## <span style="color:Red"> Phase 1. Data Loading and Data Cleaning  </span> ##

## <span style="color:blue"> Importing All Libraries Required for Development </span> ##

In [None]:
import numpy as np                             # linear algebra
import pandas as pd                            # Data processing, CSV file I/O (e.g. pd.read_csv)
import re                                      # for Data Cleaning 
import matplotlib.pyplot as plt                # For Visualization  
import re                                      # For data Cleaning 

from tqdm import tqdm                          # For ProgressBar 
import nltk                                    # For preprocessing
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer

from sklearn import metrics                    # For Accuracy Measure 
from sklearn.metrics import accuracy_score


### <span style="color:blue"> Function To Load Data From File  </span> ###

In [None]:
# Function to load data from a CSV file into a pandas DataFrame
def Load_data(Data_path,show_info = False):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    Data_path : str, path object or file-like object
        Location of the CSV data; passed straight to ``pd.read_csv``.
    show_info : bool, default False
        When True, print the summary produced by ``DataFrame.info()``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    data = pd.read_csv(Data_path)
    if show_info:
        # DataFrame.info() writes its summary to stdout itself and returns None,
        # so wrapping it in print() would emit an extra "None" line.
        data.info()
    return data

### <span style="color:blue"> Loading Data  </span> ###

In [None]:
# Path of the MBTI CSV inside the Kaggle input directory
Data_File_Location = "../input/mbti-type/mbti_1.csv"
# Load the file and print the DataFrame summary while doing so
data = Load_data(Data_File_Location, show_info=True)

### <span style="color:blue"> Function for Data Visualization and getting Data Insights    </span> ###

In [None]:
# Function to list the classes in the data and optionally count / plot them
def count_class(DataFrame,count = False ,plot = False):
    """Return the distinct class labels of a DataFrame, optionally reporting counts.

    The first column of ``DataFrame`` is treated as the class column.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame whose first column holds the class labels.
    count : bool, default False
        When True, print the per-class count of non-null values in each other column.
    plot : bool, default False
        When True, draw a bar chart of the number of rows per class,
        sorted from the smallest class to the largest.

    Returns
    -------
    list
        Unique class labels in order of first appearance.
    """
    # Work on the frame that was passed in rather than the notebook-level `data` global.
    class_column = DataFrame.columns[0]
    Classes = list(DataFrame[class_column].unique())
    if count:
        print(DataFrame.groupby(class_column).count())
    if plot:
        rows_per_class = DataFrame.groupby(class_column).size().sort_values()
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.bar(rows_per_class.index, rows_per_class.values)
        ax.set_xlabel(str(class_column))
        ax.set_ylabel("rows")
        ax.set_title("Rows per class")
        plt.show()
    return Classes

In [None]:
# Print the per-class counts, draw the bar chart and keep the list of class labels
classes = count_class(data, count=True, plot=True)

In [None]:
# Value in the first row, second column. A single positional lookup avoids the
# chained data.iloc[0][1] form, whose integer key on a label-indexed row is deprecated.
data.iloc[0, 1]

In [None]:
# Column labels of the loaded frame
data.columns

In [None]:
# Number of rows in the loaded frame
len(data)

## <span style="color:blue">   Below are the functions for data preprocessing. Preprocessing is done in stages. </span> ##

In [None]:
# Function to join all texts written by one person by turning each "|||" separator into a space
def replace_sep(text):
    """Return ``text`` with every '|||' separator swapped for a single space."""
    pieces = text.split("|||")
    return " ".join(pieces)

# Function to replace links in text with the placeholder 'Link'
def remove_link(text):
    """Replace every http:// or https:// URL in ``text`` with the token 'Link'.

    Parameters
    ----------
    text : str
        Input text, possibly containing URLs.

    Returns
    -------
    str
        The text with each URL (up to the next whitespace) replaced by 'Link'.
    """
    # Require the "://" part so ordinary words that merely start with "http"
    # (e.g. "httpd") are left untouched.
    return re.sub(r"https?://\S+", "Link", text)

# Function to strip punctuation from a list of tokens
def remove_punctuation(words):
    """Strip punctuation characters from each token and drop tokens left empty.

    Every character that is neither a word character nor whitespace is removed;
    tokens that become the empty string are omitted from the result.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']
    

In [None]:
# Try replace_sep on the first row's text (single positional lookup instead of chained iloc[0][1])
replace_sep(data.iloc[0, 1])

In [None]:
# Try remove_link on the first row's text (single positional lookup instead of chained iloc[0][1])
remove_link(data.iloc[0, 1])

### <span style="color:blue"> Pre-Processing Stage -1   </span> ###

In [None]:
def pre_processing_stage_1(text):
    """First cleaning stage: join the posts, replace links, then lower-case.

    The '|||' separators are replaced with spaces, links are replaced with 'Link',
    and the whole result is converted to lower case.
    """
    joined = replace_sep(text)
    without_links = remove_link(joined)
    return without_links.lower()

In [None]:
# Try stage 1 on the first row's text (single positional lookup instead of chained iloc[0][1])
pre_processing_stage_1(data.iloc[0, 1])

### <span style="color:blue"> Pre-Processing Stage -2  </span> ###

In [None]:
def pre_processing_stage_2(text):
    """Second cleaning stage: tokenize, drop stop words, strip punctuation, stem.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The stemmed tokens that remain, joined by single spaces.

    Notes
    -----
    Uses the notebook-level ``stemmer`` object, which must be defined before
    this function is called.
    """
    # Load the stop-word list once per call and use a set for fast membership tests.
    stop_words = set(stopwords.words('english'))
    # Build a new list instead of removing from the list being iterated:
    # removing in place makes the loop skip the token after every removed word.
    tokens = [token for token in word_tokenize(text) if token not in stop_words]
    tokens = remove_punctuation(tokens)
    return " ".join(stemmer.stem(token) for token in tokens)

### <span style="color:blue">  Final Function to Clean Data  </span> ###

In [None]:
stemmer = PorterStemmer()                     # Stemmer used by pre_processing_stage_2

def Clean_Data(df):
    """Run both preprocessing stages on every row and store the result in 'posts'.

    The text is read from the second column (position 1) of each row and the
    cleaned text is written to the 'posts' column of the same row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the raw text in its second column and a 'posts' column.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with 'posts' holding the cleaned text.
    """
    print("PreProcessing----------- ")
    # DataFrame.set_value was removed in pandas 1.0; write by position with .iat instead.
    posts_position = df.columns.get_loc('posts')
    for i in tqdm(range(df.shape[0])):
        text = df.iat[i, 1]                       # raw text of row i
        text = pre_processing_stage_1(text)       # join posts, replace links, lower-case
        text = pre_processing_stage_2(text)       # tokenize, drop stop words/punctuation, stem
        df.iat[i, posts_position] = text
    return df

### <span style="color:blue"> Data Cleaning In Progress  </span> ###

In [None]:
# Clean_Data overwrites the 'posts' column of the frame it is given and returns that same frame
data = Clean_Data(data)

In [None]:
# Show the cleaned text of the first row (single positional lookup instead of chained iloc[0][1])
print(data.iloc[0, 1])

### <span style="color:blue">   Storing cleaned Data in CSV for later Use </span> ###

In [None]:
#data.to_csv('Data.csv')

### <span style="color:blue"> Function for Up-Sampling and Down-Sampling if Data Is Not Equally Divided </span> ###

In [None]:
# This function is used to give every class the same number of rows
def up_down_sampling(data,count,random_state = None):
    """Resample ``data`` so that every class in the 'type' column has ``count`` rows.

    Classes with at least ``count`` rows are down-sampled without replacement;
    smaller classes are up-sampled with replacement.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'type' column holding the class labels.
    count : int
        Number of rows to draw for each class.
    random_state : int or None, default None
        Seed for the sampling. None keeps the draws non-deterministic.

    Returns
    -------
    pandas.DataFrame
        The per-class samples concatenated in sorted class order; original index
        labels are kept. If ``data`` has no rows, an empty frame with the same
        columns is returned.
    """
    # One shared generator so a single seed reproduces the whole resampling.
    rng = np.random.RandomState(random_state)
    samples = []
    # groupby yields the classes in sorted order and hands over each class's rows directly.
    for tp, rows in data.groupby('type'):
        print(tp)
        needs_replacement = len(rows) < count
        samples.append(rows.sample(count, replace=needs_replacement, random_state=rng))
    if not samples:
        # pd.concat raises on an empty list, so return an empty frame explicitly.
        return data.iloc[0:0].copy()
    return pd.concat(samples, axis=0)
            

In [None]:
# Resample so that every class contributes 600 rows to the trainable data
df = up_down_sampling(data, count=600)

In [None]:
# Checking trainable data: number of rows per class after resampling.
# Kept under its own name so the count_class() function defined earlier is not overwritten.
class_counts = df.type.value_counts()
class_counts

### <span style="color:blue"> Shuffling data after sampling is done </span> ###

In [None]:
# Shuffling the DataFrame; the fixed seed makes the row order repeatable for a given input
df = df.sample(frac = 1, random_state = 42)

### <span style="color:blue"> Separating Input and Output Data  </span> ###

In [None]:
# Written text (model input)
text = df['posts']

# Personality type (model output)
cator = df['type']

### <span style="color:Red"> Phase 2. Feature Extraction   </span> ###

#### <span style="color:blue">  In this phase we use CountVectorizer and TF-IDF scores for feature extraction, then use train_test_split to split the data.   </span> ####

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


In [None]:
# Count vectorizer restricted to bigrams: ngram_range=(2, 2) sets both the minimum and maximum n-gram length to 2
vector = CountVectorizer(ngram_range = (2,2))

In [None]:
# Learn the bigram vocabulary from the sampled posts.
# NOTE(review): the vocabulary is fit on all of `text`, before the train/test split further down,
# so rows that later land in the test set contribute to the features — confirm this is acceptable.
vector.fit(text)

In [None]:
# Turn every post into its bigram count vector using the fitted vocabulary
X = vector.transform(text)

In [None]:
# Labels as a NumPy array, one entry per row of X
Y = np.array(cator)

In [None]:
# Transformer that turns raw counts into TF-IDF weights (default settings)
tfidf_transformer = TfidfTransformer()

In [None]:
# Fit the TF-IDF weights on the count matrix and apply them in one step
X_final =tfidf_transformer.fit_transform(X) 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing. Stratifying keeps each class's share the same in both
# splits, and the fixed seed makes the split repeatable for a given input.
# NOTE(review): up_down_sampling draws small classes with replacement, so copies of the same row
# can end up in both the training and the test split — consider splitting before resampling.
X_train,X_test,Y_train,Y_test = train_test_split(X_final,Y,test_size = 0.20,random_state = 42,stratify = Y)

### <span style="color:Red"> Phase 3. Model Creation , Training and Testing     </span> ###

### <span style="color:Green">  Model -1. Multinomial Naive Bayes Classifier  </span> ###

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Defining model 1: multinomial Naive Bayes with default settings
cls = MultinomialNB()

In [None]:
# Training model 1 on the training split
cls.fit(X_train,Y_train)

In [None]:
# Predicting labels for the held-out test split with model 1
res = cls.predict(X_test)

In [None]:
# accuracy_score expects (y_true, y_pred): ground-truth labels first, predictions second
print("Accuracy Of Model 1 is :",accuracy_score(Y_test,res)*100)

### <span style="color:Green">  Model -2. Tree Based  Classifier  </span> ###

In [None]:
from sklearn import tree

In [None]:
# Defining model 2: decision tree; the fixed seed makes the fitted tree repeatable for a given training set
classifier_2 = tree.DecisionTreeClassifier(random_state = 42)

In [None]:
# Training model 2 on the training split
classifier_2.fit(X_train,Y_train)

In [None]:
# Predicting labels for the held-out test split with model 2
result_2 = classifier_2.predict(X_test)

In [None]:
# accuracy_score expects (y_true, y_pred): ground-truth labels first, predictions second
print("Accuracy Of Model 2 is :",accuracy_score(Y_test,result_2)*100)

#### <span style="color:Green"> Note: for these two models our output is 60-70% correct. We can improve this by using neural networks, but that requires extensive training.  </span> ####