# Exploratory data analysis

# Retrieving the Data

In [None]:
import pandas as pd # package for high-performance, easy-to-use data structures and data analysis
import numpy as np # fundamental package for scientific computing with Python
import matplotlib
import matplotlib.pyplot as plt # for plotting
import seaborn as sns # for making plots with seaborn (statistic library)

import plotly
import plotly.offline as py
py.init_notebook_mode(connected=True)
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.offline as offline
offline.init_notebook_mode()
import plotly.tools as tls



from io import StringIO



In [None]:
# Load the two main competition tables from the input directory.
# The periods_* tables are read in their own cell further down, where their
# date columns are parsed.
print("Reading Data......")

input_paths = {'train': '../input/train.csv', 'test': '../input/test.csv'}
train = pd.read_csv(input_paths['train'])
test = pd.read_csv(input_paths['test'])

print("Reading Done....")

In [None]:
# Report (rows, columns) for the two main tables.
# The periods_* tables have not been loaded at this point, so their shapes are
# not reported here. The triple-quoted string that used to "comment out" those
# prints was the last expression of the cell and was therefore echoed as the
# cell's output value.
print("size of train data", train.shape)
print("size of test data", test.shape)

# 3- Glimpse of Data
## 3.1 Overview of tables

### Train data 

In [None]:
# Preview the first rows of the training table.
train.head()

### Test data

In [None]:
# Preview the first rows of the test table.
test.head()

### Periods train data

In [None]:
# Read the periods tables; both files share the same three date columns,
# which are parsed into datetimes while loading.
period_date_columns = ["activation_date", "date_from", "date_to"]
periods_train = pd.read_csv('../input/periods_train.csv', parse_dates=period_date_columns)
periods_test = pd.read_csv('../input/periods_test.csv', parse_dates=period_date_columns)

In [None]:
# Preview the first rows of the periods table that accompanies the training data.
periods_train.head()

## 3.2 Statistical overview of the Data
### Basic information about the training data

In [None]:
# Print the column names, non-null counts and dtypes of the training table.
train.info()

### Summary statistics for the numerical features of the training data

In [None]:
# Summary statistics; with no arguments describe() covers the numeric columns.
train.describe()

### Summary statistics for the categorical features of the training data

In [None]:
# Summary statistics restricted to object-dtype columns ("O" is the object dtype code).
train.describe(include=["O"])

## 4. Data preparation
### **I- Train data**
### checking missing data in training data

In [None]:
# Missing values per column of the training table, as a count and a percentage.
# isnull() flags NaN / None / NaT cells; empty or blank strings are NOT counted.
train_nulls = train.isnull()
total = train_nulls.sum().sort_values(ascending=False)
percent = (train_nulls.sum() * 100 / train_nulls.count()).sort_values(ascending=False)
missing_train_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['total', 'percent'],
)
missing_train_data.head(10)

### checking missing data in periods training data

In [None]:
# Missing values per column of the periods training table (count and percentage).
periods_train_nulls = periods_train.isnull()
total = periods_train_nulls.sum().sort_values(ascending=False)
percent = (
    periods_train_nulls.sum() * 100 / periods_train_nulls.count()
).sort_values(ascending=False)
missing_periods_train = pd.concat(
    [total, percent],
    axis='columns',
    keys=['total', 'percent'],
)
missing_periods_train


### **Test data**
### Checking missing data in test data 

In [None]:
# Missing values per column of the test table (count and percentage).
test_nulls = test.isnull()
total = test_nulls.sum().sort_values(ascending=False)
percent = (test_nulls.sum() / test_nulls.count() * 100).sort_values(ascending=False)
missing_test = pd.concat(
    [total, percent],
    axis=1,
    keys=['total', 'percent'],
)
missing_test

### Checking missing data in periods test data 

In [None]:
# Missing values per column of the periods test table (count and percentage).
periods_test_nulls = periods_test.isnull()
total = periods_test_nulls.sum().sort_values(ascending=False)
percent = (
    periods_test_nulls.sum() / periods_test_nulls.count() * 100
).sort_values(ascending=False)
missing_periods_test = pd.concat(
    [total, percent],
    axis=1,
    keys=['total', 'percent'],
)
missing_periods_test

## 5. Data Exploration
### 5.1 Histogram and distribution of deal probability

In [None]:
# Two views of the target: a histogram of deal_probability, then the sorted
# target values plotted against their rank.
plt.figure(figsize=(12, 8))
sns.distplot(train['deal_probability'])
plt.xlabel('likelihood that an ad sold something', fontsize=12)
plt.title("Histogram of probability that an ad actually sold something")
plt.show()

# x is the rank of each ad after sorting and y is its deal probability, so the
# probability is labelled on the y axis (it used to be named on the x axis).
sorted_probability = np.sort(train['deal_probability'].values)
plt.figure(figsize=(12, 8))
plt.scatter(range(len(sorted_probability)), sorted_probability)
plt.xlabel('ad rank (sorted by deal probability)', fontsize=12)
plt.ylabel('likelihood that an ad actually sold something', fontsize=12)
plt.title("Distribution of likelihood that an ad actually sold something")
# Render the second figure explicitly instead of leaving a Text(...) repr as
# the cell output.
plt.show()

### 5.2 Histogram and distribution of Ad price

In [None]:
# Two views of the ad price: a histogram, then the sorted prices plotted
# against their rank. Ads without a price are left out of both plots.
known_prices = train['price'].dropna()

plt.figure()
sns.distplot(known_prices)
plt.xlabel('Advertisement Price')
plt.title("Histogram of Ad price")

# x is the rank of each ad after sorting and y is its price, so the price is
# labelled on the y axis (it used to be named on the x axis).
sorted_prices = np.sort(known_prices.values)
plt.figure()
plt.scatter(range(len(sorted_prices)), sorted_prices)
plt.xlabel('ad rank (sorted by price)', fontsize=12)
plt.ylabel('Ad price', fontsize=12)
plt.title("Distribution of Ad price")
plt.show()


In [None]:
# Share of ads on each side of a 0.5 deal probability, drawn as a pie chart.
# The helper column only exists for the duration of this cell.
train['deal_class'] = np.where(train['deal_probability'] >= 0.5, '>= 0.5', '<0.5')

class_counts = train['deal_class'].value_counts()
class_share = class_counts / class_counts.sum() * 100
fig = go.Figure(
    data=[go.Pie(labels=class_counts.index, values=class_share, hoverinfo='label+percent')],
    layout=go.Layout(title='Distribution of deal class'),
)
py.iplot(fig)

del train['deal_class']

* **We notice that 88% of the training data has a deal probability below 0.5, and 12% has a deal probability of 0.5 or more.**

**To make the data set easier to understand, we translate the Russian region names into English. The next cell first removes the duplicates from the `region` column and displays the resulting list of distinct regions, together with how many there are.**

In [None]:
# List the distinct regions in order of first appearance.
# Series.unique() does this in one pass; the previous hand-written loop
# re-scanned the result list for every row of the training table.
newlist = train['region'].unique().tolist()
print(newlist)
print(len(newlist))  # number of distinct regions

In [None]:
# Russian -> English region names, kept inline as a small CSV lookup table.
# StringIO comes from the notebook's import cell.
conversion = StringIO("""
region,region_english
Свердловская область, Sverdlovsk oblast
Самарская область, Samara oblast
Ростовская область, Rostov oblast
Татарстан, Tatarstan
Волгоградская область, Volgograd oblast
Нижегородская область, Nizhny Novgorod oblast
Пермский край, Perm Krai
Оренбургская область, Orenburg oblast
Ханты-Мансийский АО, Khanty-Mansi Autonomous Okrug
Тюменская область, Tyumen oblast
Башкортостан, Bashkortostan
Краснодарский край, Krasnodar Krai
Новосибирская область, Novosibirsk oblast
Омская область, Omsk oblast
Белгородская область, Belgorod oblast
Челябинская область, Chelyabinsk oblast
Воронежская область, Voronezh oblast
Кемеровская область, Kemerovo oblast
Саратовская область, Saratov oblast
Владимирская область, Vladimir oblast
Калининградская область, Kaliningrad oblast
Красноярский край, Krasnoyarsk Krai
Ярославская область, Yaroslavl oblast
Удмуртия, Udmurtia
Алтайский край, Altai Krai
Иркутская область, Irkutsk oblast
Ставропольский край, Stavropol Krai
Тульская область, Tula oblast
""")

# skipinitialspace drops the blank that follows each comma in the table above;
# without it every English name is stored with a leading space.
conversion = pd.read_csv(conversion, skipinitialspace=True)

# Drop any earlier copy of the column so that re-running this cell does not
# produce region_english_x / region_english_y.
train = train.drop(columns=['region_english'], errors='ignore')
# validate= makes the merge fail loudly if a region were listed twice in the
# lookup table, which would otherwise silently duplicate ad rows.
train = pd.merge(train, conversion, how="left", on="region", validate="many_to_one")

train.head()

In [None]:
# Spot-check the translated region names now present in the training table.
train['region_english'].head()

### 5.3 Distribution of different Ad regions

In [None]:
# Pie chart: percentage of ads per (translated) region.
region_counts = train['region_english'].value_counts()
region_share = (region_counts / region_counts.sum()) * 100
region_pie = go.Pie(
    labels=region_counts.index,
    values=region_share,
    hoverinfo='label+percent',
)
fig = go.Figure(
    data=[region_pie],
    layout=go.Layout(title='Distribution of differnet Ad regions'),
)
py.iplot(fig)

### Distribution of different Ad parent_category_name

In [None]:
# Russian -> English names of the top-level ad categories, kept inline as a
# small CSV lookup table. StringIO comes from the notebook's import cell.
conversion = StringIO("""
parent_category_name,parent_category_name_en
Личные вещи,Personal belongings
Для дома и дачи,For the home and garden
Бытовая электроника,Consumer electronics
Недвижимость,Real estate
Хобби и отдых,Hobbies & leisure
Транспорт,Transport
Услуги,Services
Животные,Animals
Для бизнеса,For business
""")

conversion = pd.read_csv(conversion)

# Drop any earlier copy of the column so that re-running this cell does not
# produce parent_category_name_en_x / parent_category_name_en_y.
train = train.drop(columns=['parent_category_name_en'], errors='ignore')
# validate= guards against a category listed twice in the lookup table, which
# would silently duplicate ad rows.
train = pd.merge(
    train, conversion, how="left", on="parent_category_name", validate="many_to_one"
)

In [None]:
# Pie chart: percentage of ads per (translated) top-level category.
category_counts = train['parent_category_name_en'].value_counts()
category_share = (category_counts / category_counts.sum()) * 100
category_pie = go.Pie(
    labels=category_counts.index,
    values=category_share,
    hoverinfo='label+percent',
)
fig = go.Figure(
    data=[category_pie],
    layout=go.Layout(title='Distribution of differnet Ad parent_category_name_en'),
)
py.iplot(fig)

## TOP FIVE
### Top 5 Ad titles

In [None]:
# Bar chart of the 20 most frequent ad titles; the five most frequent are also
# printed, together with the number of rows in the title column.
title_counts = train["title"].value_counts().head(20)
print("Top 5 Ad titles :\n", title_counts.head(5))
print("Total Ad titles : ", len(train["title"]))

axis_grey = 'rgb(107, 107, 107)'
fig = go.Figure(
    data=[go.Bar(x=title_counts.index, y=title_counts.values)],
    layout=go.Layout(
        title="Top Ad titles",
        xaxis=dict(title='', tickfont=dict(size=14, color=axis_grey)),
        yaxis=dict(
            title='Count of Ad titles',
            titlefont=dict(size=16, color=axis_grey),
            tickfont=dict(size=14, color=axis_grey),
        ),
    ),
)
py.iplot(fig)


**Top 5 Ad titles are :**
1. Платье(Dress)
1. Туфли (Shoes)
1. Куртка(Jacket)
1. Пальто (Coat)
1. Джинсы(Jeans)

### Top 5 Ad city

In [None]:
# Bar chart of the 20 cities with the most ads; the top five are also printed.
city_counts = train["city"].value_counts().head(20)
print('Top 5 Ad cities :\n', city_counts.head(5))
# Count distinct cities; this line used to print len(train["title"]), i.e. the
# number of rows of an unrelated column.
print("Total Ad cities : ", train["city"].nunique())

axis_grey = 'rgb(107, 107, 107)'
fig = go.Figure(
    data=[go.Bar(x=city_counts.index, y=city_counts.values)],
    layout=go.Layout(
        title="Top Ad city",
        xaxis=dict(title='', tickfont=dict(size=14, color=axis_grey)),
        yaxis=dict(
            title='Count of Ad cities',
            titlefont=dict(size=16, color=axis_grey),
            tickfont=dict(size=14, color=axis_grey),
        ),
    ),
)
py.iplot(fig)

**Top 5 Ad cities :**
1. Краснодар (Krasnodar)
1. Екатеринбург (Yekaterinburg)
1. Новосибирск (Novosibirsk)
1. Ростов-на-Дону (Rostov-on-don)
1. Нижний Новгород (Nizhny Novgorod)

### Top 5 Ad regions

In [None]:
# Bar chart of the 20 regions with the most ads; the top five are also printed.
region_counts = train["region_english"].value_counts().head(20)
print('Top 5 Ad regions :\n', region_counts.head(5))
# Count distinct regions; this line used to print len(train["title"]), i.e. the
# number of rows of an unrelated column.
print("Total Ad regions : ", train["region_english"].nunique())

axis_grey = 'rgb(107, 107, 107)'
fig = go.Figure(
    data=[go.Bar(x=region_counts.index, y=region_counts.values)],
    layout=go.Layout(
        title="Top Ad regions",
        xaxis=dict(title='', tickfont=dict(size=14, color=axis_grey)),
        yaxis=dict(
            title='Count of Ad regions',
            titlefont=dict(size=16, color=axis_grey),
            tickfont=dict(size=14, color=axis_grey),
        ),
    ),
)
py.iplot(fig)

**Top 5 Ad regions :**
1. Krasnodar Krai
1. Sverdlovsk oblast
1. Rostov oblast
1. Tatarstan
1. Chelyabinsk oblast

### Top 5 ad categories as classified by Avito's ad model

In [None]:
# Bar chart of the 20 fine-grained ad categories with the most ads; the top
# five are also printed.
category_counts = train["category_name"].value_counts().head(20)
print("Top 5 Fine grain ad category as classified by Avito's ad mode : \n", category_counts.head(5))
# Count distinct categories; this line used to print len(train["title"]), i.e.
# the number of rows of an unrelated column.
print("Total ad category as classified by Avito's ad mode : ", train["category_name"].nunique())

axis_grey = 'rgb(107, 107, 107)'
fig = go.Figure(
    data=[go.Bar(x=category_counts.index, y=category_counts.values)],
    layout=go.Layout(
        title="Top ad category as classified by Avito's ad mode",
        xaxis=dict(
            title='ad category as classified by Avitos ad mode',
            tickfont=dict(size=14, color=axis_grey),
        ),
        yaxis=dict(
            title='Count of ad category',
            titlefont=dict(size=16, color=axis_grey),
            tickfont=dict(size=14, color=axis_grey),
        ),
    ),
)
py.iplot(fig)

**Top 5 ad categories as classified by Avito's ad model :**
1. Clothing, shoes and accessories
1. Children clothing and shoes
1. Childrens product and toys
1. Apartments
1. Phones

### Top 5 Top level (parent)  ad category as classified by Avito's ad model

In [None]:
# Russian -> English names of the top-level ad categories, kept inline as a
# small CSV lookup table.
conversion = StringIO("""
parent_category_name,parent_category_name_english
Личные вещи,Personal belongings
Для дома и дачи,For the home and garden
Бытовая электроника,Consumer electronics
Недвижимость,Real estate
Хобби и отдых,Hobbies & leisure
Транспорт,Transport
Услуги,Services
Животные,Animals
Для бизнеса,For business
""")

conversion = pd.read_csv(conversion)

# Drop any earlier copy of the column so that re-running this cell does not
# produce parent_category_name_english_x / parent_category_name_english_y.
train = train.drop(columns=['parent_category_name_english'], errors='ignore')
# validate= guards against a category listed twice in the lookup table, which
# would silently duplicate ad rows.
train = pd.merge(
    train, conversion, on="parent_category_name", how="left", validate="many_to_one"
)

# Bar chart of the share (in %) of ads per top-level category.
parent_counts = train["parent_category_name_english"].value_counts()
# Count distinct top-level categories; this line used to print
# len(train["title"]), i.e. the number of rows of an unrelated column.
print(
    "Total Top level ad category as classified by Avito's ad model : ",
    train["parent_category_name"].nunique(),
)

axis_grey = 'rgb(107, 107, 107)'
fig = go.Figure(
    data=[go.Bar(x=parent_counts.index, y=(parent_counts / parent_counts.sum()) * 100)],
    layout=go.Layout(
        title="Top level ad category as classified by Avito's ad model",
        xaxis=dict(
            title='Top level ad category as classified by Avitos ad model',
            tickfont=dict(size=14, color=axis_grey),
        ),
        yaxis=dict(
            title='Count of Top level ad category in %',
            titlefont=dict(size=16, color=axis_grey),
            tickfont=dict(size=14, color=axis_grey),
        ),
    ),
)
py.iplot(fig)

**Top 5 top-level ad categories as classified by Avito's ad model :**

1. Personal belongings - 46 %
1. For the home and garden - 12 %
1. Consumer electronics - 12 %
1. Real estate - 10 %
1. Hobbies & leisure - 6 %

## Price in relation to deal probability

In [None]:
# Scatter of deal probability against the ad price on a log scale.
# log1p(x) = log(1 + x) is used instead of log(x) so that an ad priced at 0
# maps to 0 rather than -inf (which raises a RuntimeWarning and cannot be drawn).
plt.figure(figsize=(15, 6))
plt.scatter(np.log1p(train.price), train.deal_probability)
plt.xlabel('log(1 + Ad price)')
plt.ylabel('deal probability')
plt.show()

## Distribution of user type

In [None]:
# Pie chart: percentage of ads per user type.
user_type_counts = train['user_type'].value_counts()
user_type_share = (user_type_counts / user_type_counts.sum()) * 100
user_type_pie = go.Pie(
    labels=user_type_counts.index,
    values=user_type_share,
    hoverinfo='label+percent',
)
fig = go.Figure(
    data=[user_type_pie],
    layout=go.Layout(title='Distribution of user type'),
)
py.iplot(fig)

**Distribution of user types :**
1. Private users constitute 71.6 % of the data
1. Company users constitute 23.1 % of the data
1. Shop users constitute 5.35 % of the data

## Monthly distribution of Ad prices in different regions 

In [None]:
# Heatmap of the mean ad price per region (rows) and activation month (columns).
train['activation_date'] = pd.to_datetime(train['activation_date'])
train['month'] = train['activation_date'].dt.month
pr = (
    train.groupby(['region_english', 'month'])['price']
    .mean()
    .unstack()
    .fillna(0)  # region/month pairs without a mean price are drawn as 0
)
f, ax = plt.subplots(figsize=(15, 20))
temp = sns.heatmap(pr, cmap='Reds', ax=ax)
plt.show()

**The highest Ad prices are in the Irkutsk oblast region, followed by the Krasnodar Krai region.**