In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _subdirs, names in os.walk('/kaggle/input'):
    # Print the full path of every file found beneath the input directory.
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Jeopardy questions CSV from the Kaggle input directory.
# Later cells address columns such as ' Answer' and ' Air Date' with a leading
# space, so the headers are used exactly as they appear in the file.
df=pd.read_csv("/kaggle/input/200000-jeopardy-questions/JEOPARDY_CSV.csv")

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Report (rows, columns) of the loaded frame.
print(f"Shape of Dataset {df.shape}")

In [None]:
# Column dtypes and non-null counts.
df.info()

Step 2: Data Understanding and Cleaning

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# NOTE(review): isnull() is an alias of isna(), so this repeats the
# missing-value count from the previous cell.
df.isnull().sum()

In [None]:
# Impute missing answers with the most frequent answer (the mode).
# Per the original note, only two answers are missing.
most_common_answer = df[' Answer'].mode()[0]
df[' Answer'] = df[' Answer'].fillna(most_common_answer)

In [None]:
# Drop fully duplicated rows. drop_duplicates() returns a new DataFrame and
# leaves the original untouched, so the result must be assigned back for the
# removal to take effect.
df = df.drop_duplicates()

df.shape

No null or duplicate values remain in the dataset.

# **Step 3: Visualizing the Data**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Summary statistics, with extra upper percentiles (90th-99th) requested to
# show the tail of each numeric column.
df.describe(percentiles=[.25,.5,.75,.90,.95,.98,.99])

In [None]:
# Inspect the raw column labels before they are renamed.
df.columns

In [None]:
# Normalise the headers in a single pass: remove the leading space from the
# raw names and join multi-word names with an underscore.
column_renames = {
    "Show Number": "Show_Number",
    " Air Date": "Air_Date",
    " Round": "Round",
    " Category": "Category",
    " Value": "Value",
    " Question": "Question",
    " Answer": "Answer",
}
df = df.rename(columns=column_renames)

In [None]:
# Number of questions per category, most frequent first.
df['Category'].value_counts()

In [None]:
# Number of distinct categories (a missing value, if any, counts as one).
df['Category'].nunique(dropna=False)

In [None]:
# Number of distinct shows (a missing value, if any, counts as one).
df['Show_Number'].nunique(dropna=False)

In [None]:
# Number of distinct entries in the Value column (a missing value, if any,
# counts as one).
df['Value'].nunique(dropna=False)

In [None]:
# Frequency of each raw Value entry (still stored as text at this point).
df['Value'].value_counts()

In [None]:
# Count the Value entries that carry no dollar sign, i.e. are not amounts.
# astype(str) lets missing entries (NaN) be tested too instead of raising,
# and regex=False makes "$" a literal character rather than an end anchor.
has_dollar = df['Value'].astype(str).str.contains("$", regex=False)
count = int((~has_dollar).sum())
print(count)
# The label promises a percentage, so scale the fraction by 100.
print("percentage of invalid value", 100 * count / len(df))

In [None]:
# Remove rows that have no dollar value.
# Depending on the pandas version, read_csv keeps the literal text "None" or
# parses it as a missing value (NaN); both forms are filtered out so the
# numeric conversion that follows only ever sees dollar strings.
# .copy() makes the result an independent frame, so later column assignments
# do not write into a view of the unfiltered data.
df = df[df['Value'].notna() & (df['Value'] != "None")].copy()
df.shape

In [None]:
#converting value to numeric datatype
def converticToNum(x):
    """Convert a dollar-formatted string such as "$1,200" to an int.

    The dollar sign, thousands separators and spaces are removed explicitly,
    so the first character is no longer dropped blindly: "200" stays 200 and
    " $400" parses instead of failing.

    Parameters:
        x: text holding a whole-dollar amount.

    Returns:
        The amount as an int.

    Raises:
        ValueError: if nothing numeric is left after cleaning.
    """
    cleaned = x.replace("$", "").replace(",", "").replace(" ", "")
    return int(cleaned)
# Replace the text amounts with integers, one element at a time.
df['Value'] = df['Value'].map(converticToNum)

In [None]:
# Distribution summary of the numeric Value column, including upper percentiles.
df['Value'].describe(percentiles=[.25,.5,.75,.90,.95,.98,.99])

In [None]:
#visualizing target variable

# sns.distplot is deprecated; histplot is its replacement. kde=True and
# stat="density" keep the same look: a density-scaled histogram with a
# smoothed curve on top.
sns.histplot(df['Value'], bins=10, kde=True, stat="density")
plt.show()

The Value column is skewed, so the values are grouped into bins to reduce the number of distinct values.

In [None]:
def binning(value):
    """Round a dollar amount to a coarser step that grows with its size.

    Amounts below 1000 are rounded to the nearest hundred, amounts below
    10000 to the nearest thousand, and anything else to the nearest ten
    thousand. The rounding itself is done by np.round.
    """
    if value < 1000:
        decimals = -2
    elif value < 10000:
        decimals = -3
    else:
        decimals = -4
    return np.round(value, decimals)

# Replace every amount with its binned value.
df['Value'] = df['Value'].map(binning)


In [None]:
# Row count per bin after rounding.
df['Value'].value_counts()

In [None]:
# Drop the 20000 bin because it holds very few rows.
df = df.loc[df['Value'].ne(20000)]

In [None]:
# Number of questions per round.
df['Round'].value_counts()

In [None]:
# Share of rows per round. The result is a fraction between 0 and 1
# (count divided by the number of rows), not a value already scaled to percent.
df['Round'].value_counts()/len(df)

In [None]:
# Bar chart of the number of questions per round. The Series is passed as x=
# by keyword because newer seaborn releases treat the first positional
# argument as `data`, not as the x variable.
sns.countplot(x=df['Round'])

In [None]:
# Remove the "Final Jeopardy!" and "Tiebreaker" rounds, as they account for only 0.016% and 0.000014% of all rows.

In [None]:
# Keep only the rows whose round is neither of the two rare round types.
rare_rounds = ["Tiebreaker", "Final Jeopardy!"]
df = df[~df['Round'].isin(rare_rounds)]

In [None]:
# Remaining (rows, columns) after the round filter.
df.shape

In [None]:
# Re-plot the per-round counts after filtering. The Series is passed as x=
# by keyword because newer seaborn releases treat the first positional
# argument as `data`, not as the x variable.
sns.countplot(x=df['Round'])

# **Logistic Regression model**

In [None]:
# Look at five randomly chosen rows (no seed is set, so the rows differ per run).
df.sample(n=5)

In [None]:
import re
# Ordered (pattern, replacement) pairs. Order matters: the irregular forms
# "won't" and "can't" have to be expanded before the generic "n't" rule, and
# "n't" before the bare "'t" rule.
_CONTRACTION_RULES = (
    (r"won't", "will not"),
    (r"can\'t", "can not"),
    (r"\'m", " am"),
    (r"n\'t", " not"),
    (r"\'s", " is"),
    (r"\'d", " would"),
    (r"\'re", " are"),
    (r"\'ll", " will"),
    (r"\'t", " not"),
    (r"\'ve", " have"),
)


def rephrase(sent):
    """Expand common English contractions in ``sent`` and return the result.

    Every rule is applied to the whole string, in the order listed in
    ``_CONTRACTION_RULES``. Matching is case-sensitive and the patterns are
    lower-case.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        sent = re.sub(pattern, replacement, sent)
    return sent

In [None]:
from nltk.corpus import stopwords

# English stop words from NLTK, held in a set.
stopWords = set(stopwords.words('english'))

# Negation words that must NOT be treated as stop words, because removing
# them would flip the meaning of a sentence.
# (The bare `stopWords` expression that used to sit here was removed: it was
# not the last statement of the cell, so it displayed nothing.)
negative_words=["aren't","couldn't","didn't","doesn't","don't","hadn't","hasn't","haven't","isn't","mightn't","mustn't","needn't","no","nor","not","shan't","shouldn't","wasn't","weren't","won't","wouldn't"]


In [None]:
# Stop words to strip from the questions: the NLTK words minus the negation
# words. A set is built instead of a list because the preprocessing loop
# tests every token for membership, which a set answers in constant time.
# NOTE(review): this rebinds `stopwords`, shadowing the nltk corpus object
# imported under the same name. The name is kept because the preprocessing
# cell looks tokens up in `stopwords`.
stopwords = {word for word in stopWords if word not in negative_words}


In [None]:

# Clean every question: expand contractions, strip escape residue and
# punctuation, drop stop words, and collect the results in row order.
preprocessed_question = []
for sentance in (df['Question'].values):
    # Lower-case first. The contraction patterns in rephrase() are lower-case
    # and NLTK's English stop-word list is lower-case, so capitalised forms
    # such as "The" or "Won't" would otherwise slip past both steps.
    sent = rephrase(sentance.lower())
    # Remove literal backslash sequences (\r, \", \n) left in the text.
    sent = sent.replace('\\r', ' ')
    sent = sent.replace('\\"', ' ')
    sent = sent.replace('\\n', ' ')
    # Collapse every run of non-alphanumeric characters into a single space.
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    # https://gist.github.com/sebleier/554280
    sent = ' '.join(e for e in sent.split() if e not in stopwords)
    preprocessed_question.append(sent.strip())

In [None]:
# Attach the cleaned text as a new column and drop the columns that are no
# longer needed. A new frame is built and rebound instead of being mutated
# in place: `df` is a filtered selection at this point, and in-place writes
# to it are what trigger pandas' SettingWithCopyWarning.
df = (
    df.assign(Preprocessed_Question=preprocessed_question)
      .drop(columns=['Question', 'Air_Date'])
)


In [None]:
# Check the frame now that the cleaned question column has been added.
df.head(5)

In [None]:
#train_test_split
from sklearn.model_selection import train_test_split
# Stratified train / cross-validation / test split on the binned Value.
# random_state is fixed so that a restart-and-run-all reproduces the same
# partitions; without it every run shuffles differently.
# NOTE(review): the feature frames still contain the 'Value' column itself,
# so it must not be used as a model input downstream.
target = df['Value']
data_train, data_test, data_y_train, data_y_test = train_test_split(
    df, target, test_size=0.33, stratify=target, random_state=42)
data_train, data_cv, data_y_train, data_y_cv = train_test_split(
    data_train, data_y_train, test_size=0.33, stratify=data_y_train, random_state=42)

In [None]:
# Inspect five random rows of the training partition.
data_train.sample(n=5)

In [None]:
# Question
#https://stackoverflow.com/questions/48090658/sklearn-how-to-incorporate-missing-data-when-one-hot-encoding
from sklearn.feature_extraction.text import CountVectorizer
# Binary bag-of-words over the 2000 most frequent tokens. lowercase=False
# because the text was already lower-cased during preprocessing.
vectorizer3 = CountVectorizer(lowercase=False, binary=True, max_features=2000)
# Fit on the training partition only, so the vocabulary carries no
# information from the cv/test rows. The partitions are the data_* frames
# produced by the split cell; the project_data_* names used here before are
# never defined in this notebook and raised NameError.
vectorizer3.fit(data_train['Preprocessed_Question'].values)

feat_1_train = vectorizer3.transform(data_train['Preprocessed_Question'].values)
feat_1_cv = vectorizer3.transform(data_cv['Preprocessed_Question'].values)
feat_1_test = vectorizer3.transform(data_test['Preprocessed_Question'].values)

# Each feature matrix must have as many rows as its label vector.
print("After vectorizations")
print(feat_1_train.shape, data_y_train.shape)
print(feat_1_cv.shape, data_y_cv.shape)
print(feat_1_test.shape, data_y_test.shape)
print("="*100)
