In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Build the full path of every file found anywhere under the Kaggle input directory,
# then print them one per line in the order os.walk discovers them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Table of Contents

Import Libraries

Basic understanding of data

Data

Statistics

Descriptive statistics

Visualization

Univariate Analysis

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the sales records. thousands=',' tells pandas to treat ',' as a
# digit-group separator when it parses numeric values.
df = pd.read_csv(
    '../input/sales-records/100000 Sales Records.csv',
    thousands=',',
)

### Basic understanding of data

In [None]:
# Dimensions of the dataset as a (number of rows, number of columns) tuple
df.shape

In [None]:
# Column labels of the dataset
df.columns

In [None]:
# Preview the first five rows of the data
df.head(n=5)

In [None]:
# Preview the last five rows of the data
df.tail(n=5)

In [None]:
# Concise summary of the DataFrame: index range, column names,
# non-null counts, dtypes and memory usage
df.info()

In [None]:
# Number of missing values in each column (isna is the same check as isnull)
df.isna().sum()

### Data

There are two types of data:
1) Qualitative data: nominal data (groups are merely names, with no ranking) and ordinal data (groups have an order or ranking).

2) Quantitative data: discrete data (countable data such as the number of vehicles or the number of students) and continuous data (measurable data such as height or weight).

### Statistics

So far we have a general idea of the dataset.
Now let's explore the dataset using statistical concepts...

There are two types of statistics:
1) Descriptive statistics

2) Inferential statistics

For now, our focus is on descriptive statistics (applied to the dataset).
### *Descriptive statistics* 
Descriptive statistics describe the data through numerical calculations, tables,
or graphs.
Descriptive statistics are divided into two categories:

**1)** Measures of central tendency (mean,median,mode)

**2)** Measures of variability(spread) (range, inter quartile range, variance, standard deviation)

**Univariate data**: It consists of only one variable (analyzed with measures of central tendency and measures of dispersion, and with frequency distribution tables, histograms, pie charts, frequency polygons and bar charts).

**Bivariate data**: This type of data involves two different variables (analyzed with correlation and covariance).

**Multivariate data**: This type of data involves three or more variables (analyzed with a correlation matrix and a covariance matrix).

In [None]:
# Render floats with two fixed decimals so large values are not shown in scientific notation.
# Background reading on scientific notation:
# https://www.pythonpool.com/python-scientific-notation/
pd.options.display.float_format = '{:.2f}'.format

# describe() returns the descriptive statistics; include='all' covers every column,
# numeric and non-numeric alike.
df.describe(include='all')

### Visualization

**Univariate Categorical Variables**

In [None]:
# Frequency of each sales channel as a two-column DataFrame.
# value_counts() already returns the counts in descending order, so the most
# used channel comes first without an extra sort. The column names are set
# explicitly instead of overwriting whatever names reset_index() produces.
df_channel = (
    df['Sales Channel']
    .value_counts()
    .rename_axis('sales channel')
    .reset_index(name='count|frequency')
)

# Bar chart of the channel frequencies, drawn on an explicit Axes so the cell
# does not depend on matplotlib's implicit "current figure" state.
fig, ax = plt.subplots()
ax.bar(
    df_channel['sales channel'],
    df_channel['count|frequency'],
    color=['#8893b5', '#528f8b'],
)
ax.set_title("Most Occuring Sales Channel ", size=18)
ax.set_ylabel('Counts|frequency', size=15)
ax.set_xlabel('sales channel', size=15)
plt.show()


In [None]:
# Frequency table: add each channel's share of all records, in percent
channel_freq = df_channel['count|frequency']
df_channel['% of channels occuring|used'] = channel_freq.div(channel_freq.sum()).mul(100)

# The three columns below together form the frequency table
df_channel[['sales channel', 'count|frequency', '% of channels occuring|used']]

In [None]:
# Pie chart of the share of each sales channel.
# The counts get their own variable so that the `df_channel` frequency table
# built earlier is not replaced by a Series; overwriting it would make the
# frequency-table cell fail if it is run again after this one.
channel_counts = df['Sales Channel'].value_counts()

# Labels come from the counted data itself: value_counts() orders the
# categories by frequency, so a hard-coded label list can end up attached to
# the wrong wedge.
# Only the first (most frequent) wedge stays in place; the others are pulled
# out slightly, whatever the number of categories.
wedge_offsets = [0.05 if position else 0.0 for position in range(len(channel_counts))]

plt.pie(
    channel_counts,
    explode=wedge_offsets,
    labels=channel_counts.index,
    shadow=False,
    autopct='%1.2f%%',
)
plt.title('Sales Channel')
plt.ylabel('')
plt.legend()
plt.show()

In [None]:
# List every distinct country with a running number that starts at 1
for position, country in enumerate(df['Country'].unique(), start=1):
    print(f"{position} ) {country}")

In [None]:
# Frequency of each country as a two-column DataFrame.
# value_counts() already returns the counts in descending order, so no extra
# sort is needed. The column names are set explicitly instead of overwriting
# whatever names reset_index() produces.
df_countryX = (
    df['Country']
    .value_counts()
    .rename_axis('country')
    .reset_index(name='count|frequency')
)

# Keep only the 20 most frequent countries for the chart
df_country = df_countryX.head(20)

# Bar chart on an explicit, wide Axes so the 20 country names fit
fig, ax = plt.subplots(figsize=(20, 6))
sns.barplot(data=df_country, x='country', y='count|frequency', ax=ax)
ax.tick_params(axis='x', labelrotation=45)

ax.set_title("Most Occuring Countries ", size=18)
ax.set_ylabel('Counts|frequency', size=15)
ax.set_xlabel('countries', size=15)

plt.show()

In [None]:
# Frequency table: add each country's share of all records, in percent
country_freq = df_countryX['count|frequency']
df_countryX['% of counteries occuring'] = country_freq.div(country_freq.sum()).mul(100)

# Show the frequency table for the first 20 rows (the most frequent countries)
df_countryX[['country', 'count|frequency', '% of counteries occuring']].head(20)

**Univariate Quantitative (Continuous) Variable**

In [None]:
# Descriptive statistics of the 'Total Profit' column
df_profit = df['Total Profit']

# Quartiles are computed once and reused for the inter-quartile range (IQR = Q3 - Q1)
first_quartile = df_profit.quantile(q=0.25)
third_quartile = df_profit.quantile(q=0.75)

# (label, value) pairs in the order they are printed
profit_summary = [
    ('min:', df_profit.min()),
    ('max:', df_profit.max()),
    ('standard deviation:', df_profit.std()),
    ('mean:', df_profit.mean()),
    ('3rd Quartile:', third_quartile),
    ('1st Quartile:', first_quartile),
    ('median:', df_profit.median()),
    ('IQR (Inter Quartile Range):', third_quartile - first_quartile),
]
for label, value in profit_summary:
    print(label, value)

In [None]:
# Histogram of total profit, with plain (non-scientific) tick labels on the x-axis.
# Author's reading of the plot: the data is right skewed
# (in a right-skewed distribution, mean > median).
profit_hist_ax = sns.histplot(df_profit)
profit_hist_ax.ticklabel_format(style='plain', axis='x')

In [None]:
# Box plot of total profit, with plain (non-scientific) tick labels on the y-axis.
# A box plot shows the five-number summary (min, Q1, median, Q3, max), can reveal
# potential outliers, and is also useful for comparing distributions.
profit_box_ax = sns.boxplot(data=df_profit)
profit_box_ax.ticklabel_format(style='plain', axis='y')

In [None]:
# pandas_profiling is a handy companion package for pandas: with very little code and time
# it builds a report about the DataFrame. The original author lists variables, missing values,
# counts, correlations, interactions and samples as what it covers.
# NOTE(review): newer releases of this package are reportedly published as `ydata-profiling`;
# confirm which one the execution environment provides.

from pandas_profiling import ProfileReport
prof = ProfileReport(df)
# Uncomment to also save the report as a standalone HTML file:
# prof.to_file(output_file='output.html')
# As the last expression of the cell, the report object is displayed in the notebook output
prof

These are the basic statistics. In the upcoming notebooks we will analyze and visualize the whole
dataset from different aspects.

Notebook part # 02 coming soon.