In [None]:
# All import statements
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno
from bs4 import BeautifulSoup
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords

In [None]:
# Load the training split and preview its first rows.
train_csv_path = '../input/crowdflower/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# (rows, columns) of the training frame.
df.shape

## Understanding about the train data

1. Id 
2. Query -- Text about the query.
3. Product_title -- Text about the title of the product.
4. Product_description -- Text about the product description.
5. Median_relevance -- Median relevance score given by 3 raters
6. relevance_variance -- Variance of the relevant scores given by these three raters. 

There are 10158 training data points.


In [None]:
df.median_relevance.value_counts()[-1::-1]


In [None]:
df.median_relevance.value_counts()
x = range(4)
y = df.median_relevance.value_counts()[-1::-1]


In [None]:
ax = df.median_relevance.value_counts()[-1::-1].plot(kind='bar',color=['C0', 'C1', 'C2','C3'])
for i,j in zip(x,y):
    print(i,j)
    plt.annotate(str(round(j/df.shape[0]*100,2))+'%', xy=(i,j), ha='center', va='bottom')
plt.title('Bar plot of the target values')
plt.xlabel('Target values')
plt.show()

## Analysis:

1. Data is very imbalanced
    1. Most of the datapoints are from class - 4
    2. Very few datapoints belong to class - 1

In [None]:
# Draw missingno's matrix plot for df to see which columns have missing values.
ax = missingno.matrix(df)

In [None]:
# Share of rows whose product_description is missing, as a percentage.
n_missing_descriptions = sum(df.product_description.isna())
missing_pct = round(n_missing_descriptions / df.shape[0] * 100, 2)
print('Percentage of points that are having missing values', missing_pct, '%')

In [None]:
# I cannot just drop the 24.06% of the points randomly.
# I will fill the missing values with an empty string.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the attribute: the in-place form mutates an
# intermediate Series (chained assignment), which pandas warns about and
# which no longer updates df once Copy-on-Write is enabled.
df['product_description'] = df['product_description'].fillna('')

In [None]:
# Just confirming if there are any more NaN values.
# df.info() prints the non-null count of every column.
df.info()

## Data Analysis of the feature 'Query'

In [None]:
y = df['query'].value_counts().tolist()

In [None]:
y  = sorted(y,reverse=True)
x = range(len(y))
plt.bar(x,y)
plt.ylabel('Count')
plt.xlabel('Category')
plt.title('Bar plot of the query text categories and the count')
plt.show()

In [None]:
# Frequency table of the distinct queries.
df['query'].value_counts()

#### Analysis of Query text

1. Total 261 unique categories
2. Most of the categories are repeated in the range 30-50 times
3. Wireless mouse is the most common query, occurring 113 times in the data
4. Dollhouse bathtub was present the least times in the data - 8 times

## Analysis of the feature 'Product title'

In [None]:
y  = df.product_title.value_counts()

In [None]:
out = sum(df.product_title.value_counts()>1)
print('{} product titles are repeated more than once'.format(out))

In [None]:
out = sum(df.product_title.value_counts()==1)
print('{} product titles occured only once'.format(out))
print('Which is {}% of total data'.format(round(out/df.shape[0]*100,2)))

In [None]:
print('Max length of product title is',max(df.product_title.str.len()))

In [None]:
print('Min length of product title is',min(df.product_title.str.len()))

In [None]:
product_title_len = df.product_title.str.len()

In [None]:
# Title lengths in ascending order, plotted against their rank.
plt.plot(sorted(product_title_len))

In [None]:
# Kernel density estimate of the title lengths.
product_title_len.plot.kde()

In [None]:
# Cumulative kernel density estimate of the title lengths.
sns.kdeplot(
    data=product_title_len,
    cumulative=True
)

In [None]:
for i in range(0,100,10):
    print('{}th percentile value is {}'.format(i,np.percentile(product_title_len,i)))

In [None]:
for i in range(90,100,1):
    print('{}th percentile value is {}'.format(i,np.percentile(product_title_len,i)))

In [None]:
start = 99
for i in range(10):    
    print('{:.1f}th percentile value is {:.1f}'.format(start,np.percentile(product_title_len,start)))
    start+=0.1

In [None]:
# Number of titles longer than 145 characters.
is_long_title = product_title_len > 145
out = sum(is_long_title)
print('{} product titles have length>145 i.e greater than 99.8 percentile'.format(out))


In [None]:
sns.histplot(data = df, x = 'product_title_len', y = 'median_relevance')

In [None]:
df['product_title_len'] = df.product_title.str.len()

#### We cannot draw any conclusion from the product title length, as the length distributions of all the classes overlap

## EDA Product Description

In [None]:
# Preview the frame again before analysing the descriptions.
df.head()

In [None]:
# Character length of every description; descriptions that were missing were
# filled with '' earlier in the notebook and therefore get length 0.
df['product_description_len'] = df.product_description.str.len()

In [None]:
print('Max length of product title is',max(df.product_description_len))

In [None]:
print('Min length of product title is',min(df.product_description_len))

In [None]:
# Description lengths in ascending order, plotted against their rank.
plt.plot(sorted(df.product_description_len))

In [None]:
# Kernel density estimate of the description lengths.
df.product_description_len.plot.kde()

In [None]:
# Cumulative kernel density estimate of the description lengths.
sns.kdeplot(
    data=df.product_description_len,
    cumulative=True
)

In [None]:
for i in range(0,100,10):
    print('{}th percentile value is {}'.format(i,np.percentile(df.product_description_len,i)))

In [None]:
for i in range(90,100,1):
    print('{}th percentile value is {:.1f}'.format(i,np.percentile(df.product_description_len,i)))

In [None]:
start = 99
for i in range(10):    
    print('{:.1f}th percentile value is {:.1f}'.format(start,np.percentile(df.product_description_len,start)))
    start+=0.1

In [None]:
start = 99.9
for i in range(10):    
    print('{:.2f}th percentile value is {:.1f}'.format(start,np.percentile(df.product_description_len,start)))
    start+=0.01

In [None]:
# we can truncate sentneces of length>10,000 as 99.96% of the sentences have length <99.97 percentile

In [None]:
out= sum(df.product_description_len>10**3)
print('{} product titles have length>145 i.e greater than 99.96 percentile'.format(out))


In [None]:
# 2-D histogram of description length against the relevance label.
sns.histplot(data = df, x = 'product_description_len', y = 'median_relevance')

## Check for HTML values in each of the text column

In [None]:
def check_html(series):
    """Count the entries of ``series`` that contain at least one HTML tag.

    Every entry is parsed with BeautifulSoup's ``html.parser``; an entry is
    counted when the parsed document holds at least one tag, i.e. ``find()``
    returns something other than ``None``.

    Parameters
    ----------
    series : iterable
        Text values to inspect, e.g. a DataFrame column. Entries that are
        neither ``str`` nor ``bytes`` (such as NaN for missing text) are
        skipped instead of being handed to the parser, which cannot parse them.

    Returns
    -------
    int
        Number of entries in which a tag was found.
    """
    count = 0
    for item in series:
        # Missing values arrive as float NaN / None; they hold no markup.
        if not isinstance(item, (str, bytes)):
            continue
        if BeautifulSoup(item, "html.parser").find() is not None:
            count += 1
    return count
    

In [None]:
# Time the HTML scan of the query column. The count is captured and printed:
# a call in the middle of a cell is not displayed by the notebook.
start = datetime.now()
html_count = check_html(df['query'])
end = datetime.now()
print('Entries containing HTML tags in query:', html_count)
print('Time take to run this cell is',end-start)

In [None]:
# Time the HTML scan of the product_title column. The count is captured and
# printed: a call in the middle of a cell is not displayed by the notebook.
start = datetime.now()
html_count = check_html(df['product_title'])
end = datetime.now()
print('Entries containing HTML tags in product_title:', html_count)
print('Time take to run this cell is',end-start)

In [None]:
# Time the HTML scan of the product_description column. The count is captured
# and printed: a call in the middle of a cell is not displayed by the notebook.
start = datetime.now()
html_count = check_html(df['product_description'])
end = datetime.now()
print('Entries containing HTML tags in product_description:', html_count)
print('Time take to run this cell is',end-start)

In [None]:
# Show the row(s) whose id is 290.
# NOTE(review): the reason for picking this id is not stated; presumably it is
# an example row whose text contains HTML -- confirm.
df.loc[df.id==290]

In [None]:
## Data Analysis of the feature 'Query'