## *Free Code*

In [None]:
import pandas as pd
import numpy as np

# Toy results table used by the cells of this section; np.nan entries are missing values.
df = pd.DataFrame(
    [
        ['A', 'Z', 82, 5, 10, 26, 99, '2002-09-08'],
        ['B', 'Y', 103, 25, 15, 21, 99, '2202-06-08'],
        ['A', 'Y', 10, 13, np.nan, 25, 99, '2012-06-08'],
        ['B', 'Z', 100, 25, np.nan, 20, 99, '2002-09-08'],
        ['C', 'Y', 14, 15, 19, np.nan, 99, '2102-09-08'],
        ['D', 'Z', 1699, 18, 11, np.nan, 98, '2002-09-08'],
        ['C', 'Z', 93, 15, 17, np.nan, 99, '2002-06-08'],
        ['D', 'Y', 130, 19, 13, np.nan, 98, '2002-09-08'],
    ],
    columns=['Team', 'League', 'Round 1', 'Round 2', 'Round 3', 'Round 4', 'Final', 'Date'],
)
df.head(1)

Unnamed: 0,Team,League,Round 1,Round 2,Round 3,Round 4,Final,Date
0,A,Z,82,5,10.0,26.0,99,2002-09-08


### Create Bool Col

In [None]:
# Boolean mask (NumPy array): True on the rows whose 'Round 1' value is above 15.
np.where(df['Round 1'].gt(15), True, False)

NameError: ignored

### Working with Dates

In [None]:
# Parse the 'Date' strings (year-month-day) and store the result back in the same column.
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
# Show the dtype of every column after the conversion.
df.dtypes

Team               object
League             object
Round 1             int64
Round 2             int64
Round 3           float64
Round 4           float64
Final               int64
Date       datetime64[ns]
dtype: object

In [None]:
# Month number of the 'Date' value stored in the row labelled 0.
df.loc[0, 'Date'].month

9

In [None]:
# Filter rows by the month of 'Date'.
# Two ways of writing the same filter; only the last expression is shown as the cell output.
df[df['Date'].dt.strftime('%m') == '06'] # both sides of the comparison are str
df[df['Date'].dt.month == 6]             # both sides of the comparison are int

Unnamed: 0,Team,League,Round 1,Round 2,Round 3,Round 4,Final,Date
1,B,Y,103,25,15.0,21.0,99,2202-06-08
2,A,Y,10,13,,25.0,99,2012-06-08
6,C,Z,93,15,17.0,,99,2002-06-08


In [None]:
def expand_strDate(df, col_date, fmt='%Y-%m-%d %H:%M:%S'):
  """Split a date/time column into ``year``, ``month``, ``day`` and ``hour`` columns.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding the column to expand. It is modified in place: the four
      columns are added to (or overwritten on) this very object.
  col_date : str
      Name of the column whose values are date/time strings matching ``fmt``.
  fmt : str, default '%Y-%m-%d %H:%M:%S'
      ``strftime``-style pattern used to parse the strings.

  Returns
  -------
  pandas.DataFrame
      The same ``df`` object that was passed in.

  Raises
  ------
  KeyError
      If ``col_date`` is not a column of ``df``.
  ValueError
      If a value cannot be parsed with ``fmt``.
  """
  # Parse once, then read every component through the vectorised `.dt` accessor.
  parsed = pd.to_datetime(df[col_date], format=fmt)
  df['year']  = parsed.dt.year
  df['month'] = parsed.dt.month
  df['day']   = parsed.dt.day
  df['hour']  = parsed.dt.hour
  return df

### Indexing - Slicing

In [None]:
# Overwrite 'Team' with 'Novo' on every row whose 'Round 1' is below 20 (changes df in place).
df.loc[df['Round 1'] < 20, 'Team'] = 'Novo'
# Show just the two columns involved to check the result.
df[['Team','Round 1']]

Unnamed: 0,Team,Round 1
0,A,82
1,B,103
2,Novo,10
3,B,100
4,Novo,14
5,D,1699
6,C,93
7,D,130


## *Preprocessing*

In [None]:
import pandas as pd
import numpy as np

# Toy results table used by the preprocessing cells; np.nan entries are missing values.
df = pd.DataFrame(
    [
        ['A', 'Z', 82, 5, 10, 26, 99, '2002-09-08'],
        ['B', 'Y', 103, 25, 15, 21, 99, '2202-06-08'],
        ['A', 'Y', 10, 13, np.nan, 25, 99, '2012-06-08'],
        ['B', 'Z', 100, 25, np.nan, 20, 99, '2002-09-08'],
        ['C', 'Y', 14, 15, 19, np.nan, 99, '2102-09-08'],
        ['D', 'Z', 1699, 1, 11, np.nan, 98, '2002-09-08'],
        ['C', 'Z', 93, 15, 17, np.nan, 99, '2002-06-08'],
        ['D', 'Z', 130, 19, 13, np.nan, 98, '2002-09-08'],
    ],
    columns=['Team', 'League', 'Round 1', 'Round 2', 'Round 3', 'Round 4', 'Final', 'Date'],
)
df.head(1)

Unnamed: 0,Team,League,Round 1,Round 2,Round 3,Round 4,Final,Date
0,A,Z,82,5,10.0,26.0,99,2002-09-08


### Missing values - NaN

In [None]:
# report the missing values of every column
def null_na_values(data):
  """Print, for each column of `data`, the number of missing entries and their share of all rows (in %)."""
  total_rows = data.shape[0]
  for col in data.columns.to_list():
    missing = data[col].isna().sum()
    share = (missing / total_rows) * 100
    print(f'{col} --> Qty: {missing}  --  %: {share}')

# or just, for the counts alone: `df.isnull().sum()`

In [None]:
# populate the missing values
# `numeric_only=True` limits the median to the numeric columns: the text columns
# (Team, League, Date) have no median and make a bare `df.median()` raise on pandas >= 2.0.
# Columns without a median are left untouched by `fillna`.

df.fillna(df.median(numeric_only=True)) #can use any other stats or a scalar

In [None]:
# Drop every column that contains at least one NaN.
# The result is a new frame that is only displayed; `df` itself is not reassigned here.

df.dropna(axis=1) #`axis=0` drops rows instead

### Outliers

In [None]:
def outliers_IQR(data, col, factor=1.5):
  """Keep only the rows whose `col` value lies inside the IQR fences.

  The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both inclusive),
  where Q1/Q3 are the 25 % and 75 % quantiles of `col` and ``IQR = Q3 - Q1``.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame to filter; it is not modified.
  col : str
      Name of the numeric column the fences are computed on.
  factor : float, default 1.5
      Multiple of the IQR added beyond each quartile. Larger values keep more rows.

  Returns
  -------
  pandas.DataFrame
      The rows inside the fences. Rows with NaN in `col` are dropped because
      ``between`` evaluates to False for them. ``reset_index()`` renumbers the
      rows and keeps the previous index as a regular column.
  """
  lower_quartile = data[col].quantile(0.25)
  upper_quartile = data[col].quantile(0.75)
  fence = factor * (upper_quartile - lower_quartile)

  inside = data[col].between(lower_quartile - fence, upper_quartile + fence)
  return data[inside].reset_index()

outliers_IQR(df, 'Round 1')

In [None]:
def outliers_zsocre(data, col, std = 3):
  """Keep only the rows whose `col` value is within `std` standard deviations of the mean.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame to filter; it is not modified.
  col : str
      Name of the numeric column the z-scores are computed on.
  std : float, default 3
      Largest absolute z-score that is still kept (inclusive).

  Returns
  -------
  pandas.DataFrame
      The rows with ``|z| <= std``. ``reset_index()`` renumbers the rows and
      keeps the previous index as a regular column.

  Notes
  -----
  Missing values are left out of the mean/standard deviation
  (``nan_policy='omit'``). Without that a single NaN turns every z-score into
  NaN and the result is empty. Rows whose own value is NaN are still dropped,
  because a NaN z-score never satisfies ``<= std``.
  """
  # Local import: scipy is not imported at the top of the notebook.
  import scipy.stats as stats

  values = data[col].astype(float).to_numpy()
  z = np.abs(stats.zscore(values, nan_policy='omit'))

  return data[z <= std].reset_index()

outliers_zsocre(df, 'Round 1')

### Delete Constant and Duplicate Columns

In [None]:
# (near-)constant columns (only numeric)
# A numeric column is flagged when its standard deviation is at most 0.5.
# `select_dtypes(include='number')` replaces the old `dtype == np.object` test:
# the `np.object` alias was removed in NumPy 1.24 and raises AttributeError there.

colRemove = [
  col
  for col in df.select_dtypes(include='number').columns
  if df[col].std() <= 0.5
]

# Returns a new frame without the flagged columns; `df` itself is left unchanged.
df.drop(colRemove, axis=1)

In [None]:
# remove duplicate columns

def del_duplicate_col(data):
  """Drop every column whose values repeat an earlier column.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame to clean. The duplicated columns are removed from this object
      in place.

  Returns
  -------
  tuple
      ``(data, removed)``, where ``data`` is the frame that was passed in.
      ``removed`` is the list of dropped column labels (each listed once, in
      column order), or the string ``'None'`` when nothing was dropped.

  Notes
  -----
  Two columns count as duplicates when their values are equal element by
  element. ``Series.equals`` is checked as well so that columns holding NaN
  in the same positions are also recognised (NaN never compares equal in
  ``np.array_equal``).
  """
  columns = data.columns
  col_remove = []
  for i in range(len(columns) - 1):
    if columns[i] in col_remove:
      # Already a copy of an earlier column; its twins were found back then.
      continue
    base = data[columns[i]]
    for j in range(i + 1, len(columns)):
      if columns[j] in col_remove:
        continue
      other = data[columns[j]]
      if np.array_equal(base.values, other.values) or base.equals(other):
        col_remove.append(columns[j])
  if len(col_remove) > 0:
    # `drop(..., inplace=True)` returns None, so drop first and return the frame itself.
    data.drop(col_remove, axis=1, inplace=True)
    return data, col_remove
  else:
    return data, 'None'

### Label and One Hot Encoder

In [None]:
# Label Encoder: map every distinct category of a column to an integer code
##  has an inverse: `label.inverse_transform(data)`
##  NOTE(review): the same `label` object is fitted on 'Team' and then on 'League',
##  so after this cell it presumably only inverts League codes -- confirm, and use
##  one encoder per column if the Team codes must be inverted too.

from sklearn.preprocessing import LabelEncoder

label = LabelEncoder()
df.loc[:,'TeamLE'] = label.fit_transform(df.loc[:,'Team'])
df.loc[:,'LeagueLE'] = label.fit_transform(df.loc[:,'League'])
df

Unnamed: 0,Team,League,Round 1,Round 2,Round 3,Round 4,Final,Date,TeamLE,LeagueLE
0,A,Z,82,5,10.0,26.0,99,2002-09-08,0,1
1,B,Y,103,25,15.0,21.0,99,2202-06-08,1,0
2,A,Y,10,13,,25.0,99,2012-06-08,0,0
3,B,Z,100,25,,20.0,99,2002-09-08,1,1
4,C,Y,14,15,19.0,,99,2102-09-08,2,0
5,D,Z,1699,1,11.0,,98,2002-09-08,3,1
6,C,Z,93,15,17.0,,99,2002-06-08,2,1
7,D,Z,130,19,13.0,,98,2002-09-08,3,1


In [None]:
# One Hot Encoder -- or using `pd.get_dummies()`
##  has an inverse: `enc.inverse_transform(data)`
##  to concatenate: `df.join(dummies_vars)`

from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()
# Alternative that encodes both label columns at once:
#enc.fit_transform(df.loc[:,'TeamLE':'LeagueLE']).toarray()
# The encoder is given a 2-D input, hence the reshape to a single column;
# `.toarray()` turns the result into a dense array for display.
enc.fit_transform(df.loc[:,'TeamLE'].values.reshape(-1,1)).toarray() #or `df[['TeamLE']]`

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [None]:
# pandas alternative: replace 'TeamLE' and 'LeagueLE' with one indicator column per distinct value.
pd.get_dummies(df, columns=['TeamLE','LeagueLE'])

Unnamed: 0,Team,League,Round 1,Round 2,Round 3,Round 4,Final,Date,TeamLE_0,TeamLE_1,TeamLE_2,TeamLE_3,LeagueLE_0,LeagueLE_1
0,A,Z,82,5,10.0,26.0,99,2002-09-08,1,0,0,0,0,1
1,B,Y,103,25,15.0,21.0,99,2202-06-08,0,1,0,0,1,0
2,A,Y,10,13,,25.0,99,2012-06-08,1,0,0,0,1,0
3,B,Z,100,25,,20.0,99,2002-09-08,0,1,0,0,0,1
4,C,Y,14,15,19.0,,99,2102-09-08,0,0,1,0,1,0
5,D,Z,1699,1,11.0,,98,2002-09-08,0,0,0,1,0,1
6,C,Z,93,15,17.0,,99,2002-06-08,0,0,1,0,0,1
7,D,Z,130,19,13.0,,98,2002-09-08,0,0,0,1,0,1


### Binarization

In [None]:
# Binarization
##  Set a threshold -> values at or above it become 1, everything else 0

threshold = 99
# Vectorised comparison instead of a per-element lambda; the boolean result is cast to 0/1.
df.loc[:,'Round 1'].ge(threshold).astype(int)

## *EDA - Exploratory Data Analysis*

In [None]:
import pandas as pd
import numpy as np

# Toy results table used by the EDA cells; np.nan entries are missing values.
df = pd.DataFrame(
    [
        ['A', 'Z', 10, np.nan, 10, 26, 98],
        ['B', 'Y', np.nan, 25, 15, 21, 89],
        ['A', 'Y', 10, 13, np.nan, 25, 99],
        ['B', 'Z', 14, 25, np.nan, 20, 88],
        ['C', 'Y', 12, 15, 19, np.nan, 90],
        ['D', 'Z', 10, 18, 11, np.nan, 97],
        ['C', 'Z', 12, 15, 17, np.nan, 80],
        ['D', 'Y', 10, 19, 13, np.nan, 98],
    ],
    columns=['Team', 'League', 'Round 1', 'Round 2', 'Round 3', 'Round 4', 'Final'],
)

In [None]:
# Quick overview of the frame: size, column names, missing values and distinct values per column.
print("Num Rows: ", df.shape[0])
print("Num Columns: ", df.shape[1])
print("\nVariable: \n", df.columns.tolist())
print("\nNull Values: \n" , df.isnull().sum())
print("\nUnique Values: \n", df.nunique())

### Unique Values

In [None]:
def unique_values(data, col_unique=None):
  """Print the distinct values of one, several or all columns of `data`.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame to inspect.
  col_unique : column label or list of labels, optional
      Column(s) to report. When omitted (``None``) every column is reported.

  Returns
  -------
  None
      One line ``'<column> --> <unique values>'`` followed by a blank line is
      printed per reported column.

  Raises
  ------
  KeyError
      If a requested column does not exist in `data`.
  """
  # `is None` rather than `== None`: an identity test cannot be hijacked by
  # element-wise comparison when an array-like is passed in.
  if col_unique is None:
    selected = data.columns.to_list()
  elif isinstance(col_unique, list):
    selected = col_unique
  else:
    selected = [col_unique]

  for col in selected:
    print(f'{col} --> {data[col].unique()}\n')

### Data Distribution

In [None]:
# Numeric features
# The histogram call comes first; the cell's displayed value is the `describe()` table on the last line.
df.hist(figsize=(14,14), xrot=45) #graphic: histograms, x tick labels rotated 45 degrees
df.describe() #statistics summary

In [None]:
# Categorical features
import seaborn as sns
import matplotlib.pyplot as plt

#statistics summary of the object-dtype columns
display(df.describe(include='object'))

#count graphic: one bar chart per object column with fewer than 10 distinct values
for column in df.select_dtypes(include='object'):
    if df[column].nunique() < 10:
        sns.countplot(y=column, data=df)
        plt.show()

#boxplot graphic: TARGET_FEATURE values (x axis) split by each of those columns (y axis)
TARGET_FEATURE = 'Final'
for column in df.select_dtypes(include='object'):
    if df[column].nunique() < 10:
        sns.boxplot(y=column, x=TARGET_FEATURE, data=df)
        plt.show()

In [None]:
# Numeric(x) x Categorical(y) features
# Mean of every numeric column per category, for each object column with fewer than 10 categories.
# `numeric_only=True` is needed on pandas >= 2.0: without it the mean is also
# attempted on the other text column and raises a TypeError.
for column in df.select_dtypes(include='object'):
  if df[column].nunique() < 10:
    display(df.groupby(column).mean(numeric_only=True)) # any metric (median, min, std, ...)

### Pivot Table

In [None]:
# One row per Team and one column per League; each cell aggregates 'Final' with
# pivot_table's default aggregation (the mean).
df.pivot_table(index = ['Team'], columns = ['League'], values = 'Final')

League,Y,Z
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
A,99,98
B,89,88
C,90,80
D,98,97
