In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Table of Contents
<ul>
<li><a href="#intro">Introduction</a></li>
<li><a href="#wrangling">Data Wrangling</a></li>
<li><a href="#eda">Exploratory Data Analysis</a></li>
<li><a href="#conclusions">Conclusions</a></li>
</ul>

<a id='intro'></a>
## Introduction

>In this notebook I carry out a data analysis process on ***Nike and Adidas*** products.   
The data set contains about 3,000 rows of information about Nike and Adidas products,   
including ***basic information*** such as product name, product ID and brand,   
and ***numeric information*** such as listing price, sale price, discount and rating.   
**The questions I want to answer about this data set are:**  


***1-How many products are there in each brand?       
2-How many products are there in each company?   
3-What is the highest rating, and for which product,
  brand and company?   
4-What is the highest number of reviews, and for which
  product, brand and company?   
5-What is the average rating for each company?   
6-What is the average number of reviews for each company?   
7-What is the average value of sale price for each company?   
8-What is the relation between discount value and average value of sale price for each company?***
>


<a id='wrangling'></a>
## Data Wrangling
### General Properties

# Load data and check for cleanliness
Load the data from `Adidas Vs Nike.csv`   
Show the structure of the data (number of rows and columns)   
Print a sample of the data       
Show the datatype and the count of values for each column        
Show some statistical information about the numerical columns, such as:       
***(count, mean, std, five-number summary)*** 

In [None]:
# 1- Read the Adidas vs Nike CSV from the Kaggle input directory into a DataFrame
csv_path = '../input/adidas-vs-nike/Adidas Vs Nike.csv'
df = pd.read_csv(csv_path)
# 2- Report the (rows, columns) shape of the loaded data
dataset_shape = df.shape
print('Number Of Rows And Columns In Dataset Is: \n', dataset_shape)
# 3- Preview the first 3 rows as the cell output
df.head(n=3)

In [None]:
# Show general information about the data (column dtypes and non-null counts).
# DataFrame.info() writes the report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()

In [None]:
# Show summary statistics for the numerical columns (count, mean, std and the
# five-number summary). Left as the last expression so the notebook renders the
# table with its rich display instead of plain printed text.
df.describe()

In [None]:
# Count the missing (null) entries in every column and print the per-column totals
missing_per_column = df.isna().sum()
print("Number of missing data in each column is : \n", missing_per_column)

# information about columns with zero values 

In [None]:
# Count the rows whose value is exactly 0 in Listing Price, Reviews, Rating and
# Discount: build a boolean mask per column and sum it.
number_zero_values_listing_price = int(df['Listing Price'].eq(0).sum())
number_zero_values_reviews = int(df['Reviews'].eq(0).sum())
number_zero_values_rating = int(df['Rating'].eq(0).sum())
number_zero_values_discount = int(df['Discount'].eq(0).sum())

# Report the four counts in the same order as they were computed
print("number of zero values in listing price column is :\n ", number_zero_values_listing_price)
print("number of zero values in reviews column is :\n ", number_zero_values_reviews)
print("number of zero values in rating column is :\n ", number_zero_values_rating)
print("number of zero values in discount column is :\n ", number_zero_values_discount)

# information about duplicate rows in dataset

In [None]:
# Flag every row that is an exact repeat of an earlier row, then count the flags
duplicated_row_count = df.duplicated().sum()
print("number of duplicated rows is :\n ", duplicated_row_count)

### Data Cleaning (1-change datatype of Last Visited column)

In [None]:
# Parse the 'Last Visited' column into datetime values and store the result
# back in the same column
last_visited_parsed = pd.to_datetime(df['Last Visited'])
df['Last Visited'] = last_visited_parsed
# Show the column summary again so the new dtype can be checked
df.info()

### Data Cleaning (2-Rename columns whose names contain spaces, so they can be used in attribute-style code without raising errors)

In [None]:
# Map each space-separated column name to its snake_case replacement
column_renames = {
    'Product Name': 'product_name',
    'Product ID': 'product_id',
    'Listing Price': 'listing_price',
    'Sale Price': 'sale_price',
    'Last Visited': 'last_visited',
}
# Apply the renames to df in place
df.rename(columns=column_renames, inplace=True)
# Show the last 2 rows to check the new column names
df.tail(2)

### Look at the number of unique values in some columns, now that the renamed columns can be accessed as attributes

In [None]:
# Print the number of distinct values in four columns, in this order:
# product_name, product_id, last_visited, Brand.
unique_count_reports = [
    ('number of unique product name is : \n ', 'product_name'),
    ('number of unique product id is : \n ', 'product_id'),
    ('number of unique  last visited is : \n ', 'last_visited'),
    ('number of unique  brand is : \n ', 'Brand'),
]
for report_label, column_name in unique_count_reports:
    print(report_label, df[column_name].nunique())

### Most common product name and brand in our dataset

In [None]:
# Most common product name: take the first entry of the mode result
product_name_modes = df['product_name'].mode()
repeated_product_name = product_name_modes.iloc[0]
print('maximum repeated product name is : \n ', repeated_product_name)
# How many times the most frequent product name occurs
product_name_frequencies = df['product_name'].value_counts()
max_repeated = product_name_frequencies.max()
print('maximum repeated product name value is : \n ', max_repeated)

In [None]:
# Most common brand: take the first entry of the mode result
brand_modes = df['Brand'].mode()
repeated_brand = brand_modes.iloc[0]
print('maximum repeatedbrand is : \n ', repeated_brand)
# How many times the most frequent brand occurs
brand_frequencies = df['Brand'].value_counts()
max_repeated_brand = brand_frequencies.max()
print('maximum repeated brand value is : \n ', max_repeated_brand)

### Data Cleaning (3- dealing with columns with missing data)
We will ***forward fill*** the missing data in the string-typed columns ***as a way to avoid deleting those rows***  

In [None]:
# Forward-fill missing values in the Description column so the rows do not have
# to be dropped. .ffill() replaces fillna(method='ffill'), which is deprecated
# in pandas 2.x.
# NOTE(review): a forward fill cannot fill a missing value in the very first
# row (there is nothing before it); the check below would report that case.
df['Description'] = df['Description'].ffill()
# Confirm that no column still contains missing data
number_of_columns_with_missing_data = df.isnull().any().sum()
print("Number of columns with missing data is:\n", number_of_columns_with_missing_data)

<a id='eda'></a>
## Exploratory Data Analysis
### Research Question1(How many products in each brand?)

In [None]:
# Select the seaborn style before plotting: set_style only affects axes that
# are created after the call, so it must come first to style this chart.
sns.set_style("whitegrid")
# Count the products (non-null product names) in each brand
products_per_brand = df.groupby('Brand')['product_name'].count()
print('number of products in each brand is : \n', products_per_brand.to_frame())
# Bar chart showing how the number of products varies between brands
ax = products_per_brand.plot(kind='bar', fontsize=14, figsize=(7, 7))
# Title and axis labels are set on the returned Axes rather than through the
# implicit pyplot "current figure" state
ax.set_title('Number Of Products In Each Brand', fontsize=13)
ax.set_xlabel('Brand', fontsize=16)
ax.set_ylabel('Number Of Products', fontsize=13);

### Research Question 2 ( How many products in each company ?)

In [None]:
# Create a new column holding the company name, taken as the first
# whitespace-separated word of the Brand value. This replaces the fixed
# six-character slice, which only worked because "Adidas" is exactly six
# letters long.
# NOTE(review): assumes every Brand value starts with the company name - confirm against the data.
df['company'] = df['Brand'].str.split().str[0]
# Report the new shape (one extra column) and show the last 2 rows
print(df.shape)
df.tail(2)

In [None]:
# Select the seaborn style before plotting: set_style only affects axes that
# are created after the call, so it must come first to style this chart.
sns.set_style("darkgrid")
# Count the products (non-null product names) in each company
products_per_company = df.groupby('company')['product_name'].count()
print('number of products in each company is : \n', products_per_company.to_frame())
# Bar chart showing how the number of products varies between companies
ax = products_per_company.plot(kind='bar', fontsize=14, figsize=(9, 9))
# Title and axis labels are set on the returned Axes
ax.set_title('Number Of Products In Each Company', fontsize=13)
ax.set_xlabel('Company', fontsize=16)
ax.set_ylabel('Number Of Products', fontsize=13);


### Research Question 3 ( What is the highest rate and for which product , brand and company  ?)

In [None]:
# Find the largest value in the Rating column
rating_values = df['Rating']
highest_rate = rating_values.max()
print('the highest rate value  is : \n', highest_rate)

In [None]:
# Find which product, brand and company have the highest rating.
# idxmax() returns an index *label* (the first row holding the maximum), so
# the row is looked up with the label-based .loc. The positional .iloc only
# gives the same answer while the index is the default 0..n-1 range.
top_rating_label = df['Rating'].idxmax()
highest_product_rate = df.loc[top_rating_label, 'product_name']
print('the highest product in rate  is : \n', highest_product_rate)
# Show every column of that row as a one-column table
highest_product_rate_info = df.loc[top_rating_label]
highest_product_rate_info.to_frame()

### Research Question 4 ( What is the highest  number of reviews and for which product , brand and company  ?)

In [None]:
# Find the largest value in the Reviews column
review_counts = df['Reviews']
highest_review = review_counts.max()
print('the maximum number of reviews is : \n', highest_review)

In [None]:
# Find which product, brand and company have the maximum number of reviews.
# idxmax() returns an index *label* (the first row holding the maximum), so
# the row is looked up with the label-based .loc. The positional .iloc only
# gives the same answer while the index is the default 0..n-1 range.
top_reviews_label = df['Reviews'].idxmax()
highest_product_review = df.loc[top_reviews_label, 'product_name']
print('the highest product in reviews number  is : \n', highest_product_review)
# Show every column of that row as a one-column table
highest_product_review_info = df.loc[top_reviews_label]
highest_product_review_info.to_frame()

### Research Question 5 (What is the average rating for each company?)

In [None]:
# Select the seaborn style before plotting: set_style only affects axes that
# are created after the call, so it must come first to style this chart.
sns.set_style("darkgrid")
# Mean rating per company, rounded to one decimal place (no rows are filtered
# out here, so ratings of 0 are part of the mean)
average_rate = df.groupby('company')['Rating'].mean().round(1)
print('rates for each company : \n', average_rate.to_frame())
# Bar chart comparing the average rating of each company
ax = average_rate.plot(kind='bar', fontsize=14, figsize=(9, 9))
# Title and axis labels are set on the returned Axes
ax.set_title('Rates For Each Company', fontsize=13)
ax.set_xlabel('Company', fontsize=16)
ax.set_ylabel('Rates', fontsize=13);

### For only positive values of rating

In [None]:
# Keep only the rows whose Rating is strictly greater than 0, as a separate frame
has_positive_rating = df['Rating'].gt(0)
rates = df.loc[has_positive_rating].copy()
# Show the last 2 rows of the filtered frame
rates.tail(2)

In [None]:
# Select the seaborn style before plotting: set_style only affects axes that
# are created after the call, so it must come first to style this chart.
sns.set_style("darkgrid")
# Mean rating per company over the positive-rating rows only, rounded to one decimal
average_rate = rates.groupby('company')['Rating'].mean().round(1)
print('rates for each company(postive values only) is : \n', average_rate.to_frame())
# Bar chart comparing the average positive rating of each company
ax = average_rate.plot(kind='bar', fontsize=14, figsize=(9, 9))
# Title and axis labels are set on the returned Axes
ax.set_title('Rates For Each Company(postive values only)', fontsize=13)
ax.set_xlabel('Company', fontsize=16)
ax.set_ylabel('Rates', fontsize=13);

### Research Question 6 (What is the average number of reviews for each company?)

In [None]:
# Select the seaborn style before plotting: set_style only affects axes that
# are created after the call, so it must come first to style this chart.
sns.set_style("darkgrid")
# Mean number of reviews per company, rounded to one decimal place (no rows are
# filtered out here, so review counts of 0 are part of the mean)
average_reviews = df.groupby('company')['Reviews'].mean().round(1)
print('average number of reviews in each company is : \n', average_reviews.to_frame())
# Bar chart of the average number of reviews per company. This plots
# average_reviews; the earlier code plotted the rating averages by mistake
# under the reviews title.
ax = average_reviews.plot(kind='bar', fontsize=14, figsize=(9, 9))
# Title and axis labels are set on the returned Axes
ax.set_title('Average Number Of Reviews In Each Company', fontsize=13)
ax.set_xlabel('Company', fontsize=16)
ax.set_ylabel('Average Number Of Reviews', fontsize=13);

### For only positive values of reviews

In [None]:
# Keep only the rows whose Reviews count is strictly greater than 0, as a separate frame
has_positive_reviews = df['Reviews'].gt(0)
reviews = df.loc[has_positive_reviews].copy()
# Show the last 2 rows of the filtered frame
reviews.tail(2)

In [None]:
# Select the seaborn style before plotting: set_style only affects axes that
# are created after the call, so it must come first to style this chart.
sns.set_style("darkgrid")
# Mean number of reviews per company over the positive-review rows only,
# rounded to one decimal place
average_reviews = reviews.groupby('company')['Reviews'].mean().round(1)
print('average number of reviews in each company(postive values only) is : \n', average_reviews.to_frame())
# Bar chart of the average number of reviews per company. This plots
# average_reviews; the earlier code plotted the rating averages by mistake
# under the reviews title.
ax = average_reviews.plot(kind='bar', fontsize=14, figsize=(5, 5))
# Title and axis labels are set on the returned Axes
ax.set_title('Average Number Of Reviews In Each Company(postive values only)', fontsize=13)
ax.set_xlabel('Company', fontsize=16)
ax.set_ylabel('Average Number Of Reviews', fontsize=13);

### Research Question 7 (What is the average value of sale price for each company?)

In [None]:
# Select the seaborn style before plotting: set_style only affects axes that
# are created after the call, so it must come first to style this chart.
sns.set_style("darkgrid")
# Mean sale price per company, rounded to one decimal place
average_sale_price = df.groupby('company')['sale_price'].mean().round(1)
print('average value of sale price in each company is : \n', average_sale_price.to_frame())
# Bar chart comparing the average sale price of each company
ax = average_sale_price.plot(kind='bar', fontsize=14, figsize=(5, 5))
# Title and axis labels are set on the returned Axes
ax.set_title('Average Value Of Sale Price In Each Company', fontsize=13)
ax.set_xlabel('Company', fontsize=16)
ax.set_ylabel('Average Value Of Sale Price', fontsize=13);

### Research Question 8 (What is the relation between discount value and average value of sale price for each company?)

In [None]:
# Correlation between the Discount and sale_price columns over all rows,
# rounded to one decimal place
discount_values = df['Discount']
sale_price_values = df['sale_price']
correlaton_cofficient = round(discount_values.corr(sale_price_values), 1)
print('correlaton cofficient between discount and sale price is: \n', correlaton_cofficient)

In [None]:
# Select the seaborn style before plotting: set_style only affects axes that
# are created after the call, so it must come first to style this chart.
sns.set_style("darkgrid")
# Scatter chart of sale price against discount, to show how the two relate
ax = df.plot.scatter(x='Discount', y='sale_price')
# Title and axis labels are set on the returned Axes
ax.set_title('Relation Between Discount And Sale Price', fontsize=15)
ax.set_xlabel('Discount', fontsize=16)
ax.set_ylabel('sale_price', fontsize=16);

<a id='conclusions'></a>
## Conclusions 
***1-(Adidas CORE / NEO) brand has the maximum number of products (1111)and    
      (Adidas Adidas ORIGINALS) brand has the minimum number of products
      with only one product***    
   
***2-Adidas has more products than Nike***    
***3-the highest rating value is (5) for the (Women's adidas Running  Duramo 9 Shoes) product 
     from the adidas company***    
***4-the maximum number of reviews is (223) for (Air Jordan 10 Retro) product
     for nike company***   
***5-the average rating for adidas is higher than for nike, but for positive ratings only nike is higher than adidas; the average over all ratings is lower than the average over positive ratings only, because ratings of zero pull the mean down***   
***6-the average number of reviews for adidas is higher than for nike; the average over all products is lower than the average over positive review counts only, because review counts of zero pull the mean down***   
***7-the average value of sale price of adidas is higher than nike*** 
***8-There is a strong negative correlation between discount and sale price, so when the discount value is high the sale price tends to be low*** 

