# Common Data Analysis in Python

In [14]:
import sys
import logging

import numpy as np
import scipy as sp
import sklearn
import statsmodels.api as sm
from statsmodels.formula.api import ols

%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
sns.set_context("poster")
sns.set(rc={'figure.figsize': (16, 9.)})
sns.set_style("whitegrid")

import pandas as pd
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)

logging.basicConfig(level=logging.INFO, stream=sys.stdout)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Create calenda data

In [2]:

df_dates = pd.DataFrame()

df_dates['date'] = pd.date_range('2015-01-01', periods = 72, freq ='D') 
  
# Create features for year, month, day, hour, and minute 
df_dates['year'] = df_dates['date'].dt.year 
df_dates['month'] = df_dates['date'].dt.month 
df_dates['day'] = df_dates['date'].dt.day 
df_dates['weekday_name'] = df_dates['date'].dt.weekday_name
df_dates['month_end'] = df_dates['date'] + pd.offsets.MonthEnd(0)
  
# Print the dates divided into features 
df_dates.head(10) 



Unnamed: 0,date,year,month,day,weekday_name,month_end
0,2015-01-01,2015,1,1,Thursday,2015-01-31
1,2015-01-02,2015,1,2,Friday,2015-01-31
2,2015-01-03,2015,1,3,Saturday,2015-01-31
3,2015-01-04,2015,1,4,Sunday,2015-01-31
4,2015-01-05,2015,1,5,Monday,2015-01-31
5,2015-01-06,2015,1,6,Tuesday,2015-01-31
6,2015-01-07,2015,1,7,Wednesday,2015-01-31
7,2015-01-08,2015,1,8,Thursday,2015-01-31
8,2015-01-09,2015,1,9,Friday,2015-01-31
9,2015-01-10,2015,1,10,Saturday,2015-01-31


## Load data

In [34]:
df_base = sns.load_dataset('titanic')

In [11]:
df_base.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## Data Exploring Analysis

In [22]:
# rows and columns
df_base.shape

(891, 15)

In [23]:
# samples
df_base.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [35]:
# rename columns
df_base = df_base.rename({'who':'man woman','alone':'alone_f'},axis='columns')
df_base.columns


Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'man woman', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone_f'],
      dtype='object')

In [36]:

# replace all column names space with '_'
df_base.columns = df_base.columns.str.replace(' ', '_')

# add prefix or suffix to columns
#df_base.add_prefix('x_')
#df_base.add_suffix('x_')

df_base.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'man_woman', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone_f'],
      dtype='object')

In [40]:
# columns by type
df_base.dtypes


survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
man_woman        object
adult_male         bool
deck           category
embark_town      object
alive            object
alone_f            bool
dtype: object

In [41]:

# select columns by type
df_base.select_dtypes(include='number').head()
df_base.select_dtypes(include='object').head()
df_base.select_dtypes(include=['number', 'object', 'category', 'datetime']).head()
df_base.select_dtypes(exclude='number').head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
0,0,3,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,1,3,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,0,3,35.0,0,0,8.05


In [None]:
#convert string to numeric and fill 0 if errors NaN
#pd.to_numeric(df_base.pclass,errors='coerce').fillna(0)
#df_base.dtypes

In [44]:
# build a data frame read csv files
# from glob import glob
# stock_files = sorted(glob('data/stocks*.csv'))
# stock_files
# pd.concat((pd.read_csv(file) for file in stock_files), ignore_index=True)

In [6]:
import pandas_profiling

In [7]:
profile = df_base.profile_report()

INFO:numexpr.utils:NumExpr defaulting to 4 threads.


In [None]:
profile

## Data manupulation

In [18]:
# select rows and columns
df_base.isna().mean()

survived       0.000000
pclass         0.000000
sex            0.000000
age            0.198653
sibsp          0.000000
parch          0.000000
fare           0.000000
embarked       0.002245
class          0.000000
who            0.000000
adult_male     0.000000
deck           0.772166
embark_town    0.002245
alive          0.000000
alone          0.000000
dtype: float64

In [55]:
# select rows
df_base.embark_town.unique()

array(['Southampton', 'Cherbourg', 'Queenstown', nan], dtype=object)

In [60]:
# filter rows based on selection

df_base[(df_base.embark_town=='Queenstown') |
       (df_base.embark_town=='Southampton')].head()

df_base[df_base.embark_town.isin(['Queenstown','Southampton'])].head()
df_base[~df_base.embark_town.isin(['Queenstown','Southampton'])].head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,man_woman,adult_male,deck,embark_town,alive,alone_f
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
19,1,3,female,,0,0,7.225,C,Third,woman,False,,Cherbourg,yes,True
26,0,3,male,,0,0,7.225,C,Third,man,True,,Cherbourg,no,True
30,0,1,male,40.0,0,0,27.7208,C,First,man,True,,Cherbourg,no,True


In [62]:
# Filter a DataFrame by largest categories
counts = df_base.embarked.value_counts()
counts

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [63]:
counts.nlargest(1)

S    644
Name: embarked, dtype: int64

In [64]:
counts.nlargest(1).index

Index(['S'], dtype='object')

In [66]:
df_base[df_base.embarked.isin(counts.nlargest(1).index)].head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,man_woman,adult_male,deck,embark_town,alive,alone_f
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True


## Ploting

## Define target

## Variable treatment, missing and capping

In [18]:
# Calculate % of missing values in each column
df_base.isna().mean()


survived       0.000000
pclass         0.000000
sex            0.000000
age            0.198653
sibsp          0.000000
parch          0.000000
fare           0.000000
embarked       0.002245
class          0.000000
who            0.000000
adult_male     0.000000
deck           0.772166
embark_town    0.002245
alive          0.000000
alone          0.000000
dtype: float64

In [20]:
# Drop columns in which more than 70% of values are missing:
df_rm_missing = df_base.dropna(thresh=len(df_base)*(1-0.7), axis='columns')

In [21]:
df_rm_missing.isna().mean()

survived       0.000000
pclass         0.000000
sex            0.000000
age            0.198653
sibsp          0.000000
parch          0.000000
fare           0.000000
embarked       0.002245
class          0.000000
who            0.000000
adult_male     0.000000
embark_town    0.002245
alive          0.000000
alone          0.000000
dtype: float64

## Feature engineering

In [45]:
# add new col


## Train models

In [None]:
# partition dataframe
train = df_base.sample(frac=0.75, random_state=1234)
test = df_base.drop(train.index)

In [19]:
len(df_base)

891

## Hyper-parameters Tuning

## Model comparison and selection

## Model interpretation

## Artifacts exports

## Tesing

In [30]:
#Need to create a DataFrame for testing?

pd.util.testing.makeDataFrame() # contains random values
pd.util.testing.makeMissingDataframe()#some values missing
pd.util.testing.makeTimeDataFrame() # has DateTimeIndex
pd.util.testing.makeMixedDataFrame() # mixed data types#Python #pandas 

Unnamed: 0,A,B,C,D
0,0.0,0.0,foo1,2009-01-01
1,1.0,1.0,foo2,2009-01-02
2,2.0,0.0,foo3,2009-01-05
3,3.0,1.0,foo4,2009-01-06
4,4.0,0.0,foo5,2009-01-07
