# Install a package command

In [None]:
!pip install <package_name>

# Import Packages commands

In [8]:
# Data Analysis packages
import pandas as pd
import pandas_profiling 
import numpy as np


# Data Visualization packages
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Other useful packages
from datetime import datetime
import warnings
import os
from tqdm import tqdm, tqdm_notebook
from subprocess import check_output
from pydotplus.graphviz import graph_from_dot_data

# Sklearn API
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
# Classification Algo Metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
# Regression Algo Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score





# Stats API
import scipy.stats as stats
import statsmodels.formula.api as smf

# Plot styling. The seaborn theme call rewrites matplotlib rcParams (font sizes included),
# so the theme is applied first and the explicit font size afterwards; in the opposite
# order the font setting is overwritten. A single theme call is enough: an earlier
# `style="white"` call would be replaced wholesale by this one.
sns.set(style="whitegrid", color_codes=True)
plt.rc("font", size=14)

# NOTE(review): this hides every warning, deprecation notices included.
warnings.filterwarnings('ignore')

# Let pandas show long cell values and wide/tall frames without truncation.
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_rows', 500) # OR pd.options.display.max_rows = 500
pd.set_option('display.max_columns', 500) # OR pd.options.display.max_columns = 500



# OS package commands

In [10]:
# Print the names of the entries found in the workspace directory.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
workspace_dir = "/Users/rajkgupta/DATASCIENCE"
print(os.listdir(workspace_dir))

['Statistics', 'ExpediaGroupDataScienceAcademy', 'gitrepos', 'UdemyCourses', '.DS_Store', 'PythonandS3', 'Tableau', 'NaturalLanguageProcessing', 'Computer Vision', 'Python', 'Data Visualization', 'R', 'Fee Receipt', 'MachineLearning', 'Numpy', 'EDA Project', 'PythonandStatistics', 'PyCharmProjects', 'INSAIDGCDProgramSyllabus', 'YoutubeVideos', 'AboutDataScience', 'Useful EDA commands.ipynb', 'WebScrappingScript', 'PythonForFinance', 'PythonDSPresentations', '.ipynb_checkpoints', 'PythonPractice', 'Deep Learning', 'Assignments - Numpy and Pandas', 'ODSCMeetup', 'Data', 'PythonandSparkforBigData', 'Pandas', 'AnalyticsLab', 'Career Guide']


# Check Missing Values

In [None]:
# Colour every cell of `df` by whether it is null, so gaps show up as bands.
# The colour bar and row labels are switched off.
missing_mask = df.isnull()
sns.heatmap(missing_mask, cmap='viridis', cbar=False, yticklabels=False)

# Check Correlation

In [None]:
# Correlation heatmap of the numeric columns of `df`.
# The numeric/bool columns are selected explicitly: from pandas 2.0 on, `corr()` no longer
# drops non-numeric columns silently and raises if a text column is present.
plt.figure(figsize=(6,4))
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(), cmap='Blues', annot=False)

# Heat Map with annotations

In [None]:
# Annotated heat map of the k columns most correlated with `quality`.
k = 12 #number of variables for heatmap
# Restrict to numeric/bool columns so `corr()` also works under pandas 2.x, and compute
# the correlation matrix once instead of twice.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()
# Row labels of the k largest correlations with `quality` (the column itself ranks first,
# its self-correlation being 1).
cols = corr_matrix.nlargest(k, 'quality').index
# Pairwise correlations do not depend on the other columns, so the sub-matrix can be
# sliced out of the full matrix.
cm = corr_matrix.loc[cols, cols]
plt.figure(figsize=(10,6))
sns.heatmap(cm, annot=True, cmap = 'viridis')

# To Check Outliers

In [None]:
# One box plot per column of `df`, laid out on a grid, to eyeball outliers.
l = df.columns.values
number_of_columns=12
# Grid rows needed to hold every column (ceiling division, at least one row).
# The earlier expression `len(l)-1/number_of_columns` evaluated to a float, because `/`
# binds tighter than `-`, and matplotlib requires integer grid dimensions.
number_of_rows = max(1, -(-len(l) // number_of_columns))
# Style is set once, before any axes are created.
sns.set_style('whitegrid')
plt.figure(figsize=(number_of_columns,5*number_of_rows))
for i, column in enumerate(l):
    plt.subplot(number_of_rows, number_of_columns, i+1)
    # Passing the data as `y` draws the box vertically.
    sns.boxplot(y=df[column], color='green')
# Spacing is fixed once, after all panels exist, instead of on every iteration.
plt.tight_layout()

# To check distribution-Skewness

In [None]:
# Histogram + KDE per column to inspect the shape and skew of each distribution.
# Uses `l` (column names) and `number_of_columns` defined in the outlier cell; the row
# count is computed here as an integer (ceiling division) so the subplot grid is valid
# regardless of how `number_of_rows` was derived there.
grid_rows = max(1, -(-len(l) // number_of_columns))
plt.figure(figsize=(2*number_of_columns,5*grid_rows))
for i, column in enumerate(l):
    plt.subplot(grid_rows, number_of_columns, i+1)
    # NOTE(review): `distplot` is deprecated in recent seaborn releases; confirm the
    # installed version before relying on it.
    sns.distplot(df[column],kde=True)

# Progress Apply for showing progress bar

In [None]:
# Register `progress_apply` on pandas objects through the tqdm class itself.
# Calling `tqdm_notebook()` first creates a progress-bar instance with nothing to iterate
# (an empty bar is left behind) and that alias is deprecated in current tqdm releases.
tqdm.pandas()
# Row-wise apply with a progress bar; `movies` and `calculcateNewRating` must already exist.
movies.progress_apply(lambda row: calculcateNewRating(row['Genre'], row['Rating']), axis=1)

# Create UDFs (user-defined functions)

In [11]:
def continuous_var_summary(x):
    """Build a data-audit profile for one continuous variable.

    Parameters
    ----------
    x : pandas.Series
        The column to summarise.

    Returns
    -------
    pandas.Series
        Count, missing count, sum, mean, median, standard deviation, variance,
        minimum, the 1/5/10/25/50/75/90/95/99th percentiles and maximum, labelled
        'N', 'NMISS', 'SUM', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'MIN', 'P1' ... 'P99', 'MAX'.
    """
    # (label, value) pairs, kept in the order the report presents them.
    entries = [
        ('N', x.count()),
        ('NMISS', x.isnull().sum()),
        ('SUM', x.sum()),
        ('MEAN', x.mean()),
        ('MEDIAN', x.median()),
        ('STD', x.std()),
        ('VAR', x.var()),
        ('MIN', x.min()),
    ]

    percentile_cuts = [
        ('P1', 0.01), ('P5', 0.05), ('P10', 0.10), ('P25', 0.25), ('P50', 0.50),
        ('P75', 0.75), ('P90', 0.90), ('P95', 0.95), ('P99', 0.99),
    ]
    for label, fraction in percentile_cuts:
        entries.append((label, x.quantile(fraction)))

    entries.append(('MAX', x.max()))

    labels = [label for label, _ in entries]
    values = [value for _, value in entries]
    return pd.Series(values, index = labels)

In [16]:
def categorical_var_summary(x):
    """Build a data-audit profile for one categorical variable.

    Parameters
    ----------
    x : pandas.Series
        The column to summarise.

    Returns
    -------
    pandas.Series
        Labelled 'N' (non-null count), 'NMISS' (null count), 'MODE' (most frequent
        value), 'FREQ' (its count) and 'PERCENT' (its share of the non-null values,
        rounded to two decimals). For a column without any non-null value, 'MODE' and
        'PERCENT' are NaN and 'FREQ' is 0 instead of raising.
    """
    n_valid = x.count()
    n_missing = x.isnull().sum()

    # `value_counts` already returns counts in descending order, so the first entry is
    # the mode; no extra sort is needed.
    counts = x.value_counts()

    if counts.empty:
        # Nothing to count (empty or all-null column): report neutral values rather than
        # failing on an out-of-range lookup / dividing by zero.
        mode_value, mode_freq, mode_percent = float('nan'), 0, float('nan')
    else:
        mode_value = counts.index[0]
        mode_freq = counts.iloc[0]
        mode_percent = round(mode_freq * 100/n_valid, 2)

    return pd.Series([n_valid, n_missing, mode_value, mode_freq, mode_percent],
                     index = ['N', 'NMISS', 'MODE', 'FREQ', 'PERCENT'])

In [15]:
def missing_imputation(x, stats = 'mean'):
    """Fill the missing values of one column.

    Parameters
    ----------
    x : pandas.Series
        The column to impute.
    stats : str, default 'mean'
        For float64/int64 columns: 'mean' fills with the mean, any other value fills
        with the median. Ignored for all other dtypes.

    Returns
    -------
    pandas.Series
        float64/int64 columns are filled with the mean or median; every other column is
        filled with its most frequent value. A column with no non-null value at all is
        returned unchanged, since it has no mode to fill with.
    """
    if (x.dtypes == 'float64') | (x.dtypes == 'int64'):
        fill_value = x.mean() if stats == 'mean' else x.median()
        return x.fillna(fill_value)

    # `mode()` returns a Series (there can be several modes). Passing that Series to
    # `fillna` would align on index labels and leave most gaps unfilled, so the first
    # mode is taken as a scalar instead.
    modes = x.mode()
    if modes.empty:
        return x
    return x.fillna(modes.iloc[0])

In [14]:
def create_dummies(df, colname):
    """Replace a column with its dummy (indicator) columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    colname : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of `df` except `colname`, followed by the
        indicator columns (prefixed with `colname`, first level dropped).
    """
    indicator_cols = pd.get_dummies(df[colname], prefix = colname, drop_first = True)
    remaining_cols = df.drop(columns = colname)
    return pd.concat([remaining_cols, indicator_cols], axis = 1)

# Pandas Profiling

In [None]:
# Build a profiling report for `cars` and write it to 'profilereport.html'
# (a relative path, so it is resolved against the current working directory).
# NOTE(review): `cars` is not defined in the visible cells - presumably a DataFrame loaded
# earlier; confirm. Newer releases of this package are published as `ydata_profiling`;
# check which one the environment provides.
report = pandas_profiling.ProfileReport(cars)
report.to_file(output_file = 'profilereport.html')

# Separate categorical and numerical features

In [None]:
# Separate categorical and continuous variables by dtype:
# float64/int64 columns are treated as continuous, object columns as categorical.
cars_conti_vars = cars.select_dtypes(include = ['float64', 'int64'])
cars_cat_vars = cars.select_dtypes(include = ['object'])

# A notebook cell renders only its last expression, so both audit tables are displayed
# explicitly; otherwise the continuous summary would be computed and thrown away.
# (`display` is provided by IPython inside a notebook.)
display(cars_conti_vars.apply(continuous_var_summary).T.round(1))
display(cars_cat_vars.apply(categorical_var_summary).T)

# outlier treatment

In [None]:
# Cap extreme values: each continuous column is clipped to the range between its own
# 1st and 99th percentile.
cars_conti_vars = cars_conti_vars.apply(
    lambda col: col.clip(lower = col.dropna().quantile(0.01), upper = col.quantile(0.99))
)

# missing value treatment

In [None]:
# Impute both frames column by column with `missing_imputation` (default settings):
# first the continuous frame, then the categorical one.
cars_conti_vars, cars_cat_vars = (
    frame.apply(missing_imputation) for frame in (cars_conti_vars, cars_cat_vars)
)