# Import python libraries

In [1]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from nltk import FreqDist, pos_tag
from nltk.tokenize import word_tokenize
from collections import Counter
from sklearn.utils import resample

# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# merge Sherlock and Poirot data
df_h_p= pd.read_csv('./datasets/data_for_eda.csv')

### Setup Train and Test

In [3]:
# lets set up X and y
X=df_h_p[['title']]
y=df_h_p['subreddit']

In [4]:
# Check what we need to check in a classification problem.
# This is the baseline --> accuracy
y.value_counts(normalize=True)

1    0.616941
0    0.383059
Name: subreddit, dtype: float64

In [5]:
# Split the data into the training and testing sets.
X_train, X_test, y_train , y_test = train_test_split(
                                                        X,
                                                        y,
                                                        test_size = 0.33,
                                                        random_state = 42)

### Since there is Class imbalance lets Oversample Poirot class to make it even distribution

In [6]:
# lets merge the features and y data set
resamp = pd.concat([X_train,y_train], axis = 1)

In [7]:
# create a df for holmes
holmes = resamp[resamp['subreddit'] == 1]

In [8]:
# create a df for poirot
poirot = resamp[resamp['subreddit'] == 0]

In [9]:
# resample poirot replacing records to make it same lenght as holmes
poirot_resample_df = resample(poirot,replace = True,n_samples = len(holmes),random_state = 42 )

In [10]:
# lets combine holmes data set and poirot resampled data set
df_h_p_bal = pd.concat([holmes ,poirot_resample_df], axis = 0)

In [11]:
# lets look at the data set
df_h_p_bal.groupby(by = 'subreddit').count()

Unnamed: 0_level_0,title
subreddit,Unnamed: 1_level_1
0,328
1,328


In [12]:
# lets divide the data again into X_train and y_train
X_train = df_h_p_bal['title']
y_train = df_h_p_bal['subreddit']


### Export the Train Test data to be used for modelling

In [13]:
# export the file to CSv
X_train.to_csv("./datasets/X_train.csv" , index = False)
y_train.to_csv("./datasets/y_train.csv", index = False)
X_test.to_csv("./datasets/X_test.csv", index = False)
y_test.to_csv("./datasets/y_test.csv", index = False)