# PYTHON TOOLBOX

# 1. Files

### **Basic file handling**

```python
# _OPEN
filename = "hello.txt"
# Modes: "r" --> read, "w" --> write (truncates the file), "a" --> append
with open(filename, "r", encoding="utf-8") as file:
    for line in file:
        # Each line keeps its trailing "\n": end="" avoids printing an empty line in between
        print(line, end="")
```

Other basic methods: read() (returns one big string), readlines() (returns list of strings), write()


```python
# _LIST_DIR
import os

# Path shortcuts: "." is the current directory, ".." is its parent directory
directory = os.getcwd()  # not named `dir`, which would shadow the builtin
for name in os.listdir(directory):
    if name.endswith(".ipynb"):
        print(os.path.join(directory, name))
```

### Pandas file reading

```python
import pandas as pd

# Read (each call returns a new DataFrame)

#CSV
df = pd.read_csv('file.csv', encoding = "utf8")

#EXCEL
df = pd.read_excel('file.xlsx')

# Write (index=None: the row index is not written to the file)
df.to_csv('articles_nyt_2018_01_30.csv', index=None, encoding='utf8')
```

# 2. Types

### **Datetime**

```python
from datetime import datetime

# Date creation (year, month, day are ints)
date = datetime(year, month, day).date()

# Date parsing
date_object = datetime.strptime("21 June, 2018", "%d %B, %Y")


# Date parsing with Pandas
import pandas as pd
# '%d/%m/%y' = day/month/2-digit year, so "18/05/01" is read as 18 May 2001
df['date'] = pd.to_datetime(df['date_str'], format='%d/%m/%y')

# Datetime difference
# Column-wise: subtracting two datetime columns gives timedeltas; .dt.days keeps the whole days.
# (.astype('timedelta64[D]') is rejected by pandas >= 2.0)
df['time_diff'] = (df['date_1'] - df['date_2']).dt.days
# Scalar: subtracting two datetime objects gives a timedelta
time_delta = date_1 - date_2

time_delta.total_seconds()  # Timedelta expressed in seconds
```


### **Handling arrays with numpy**

```python
import numpy as np


#### Array handling ####

x = x[~np.isnan(x)] # Filter NaNs in arrays (keeps only the entries that are not NaN)
X.sort() # Inplace sorting
sorted(X) # New sorted version of X (X itself is left unchanged)

#### Linspace ####

# nb_points evenly spaced values from start to end (endpoint=True: end is included)

np.linspace(start,end,nb_points,endpoint=True)

```

# 3. Pandas


```python
```


```python
df = pd.read_csv('../input/XXX.csv')
df.head()
df.info()

df.column_1.max() # Max of col series
df.column_1.shape # Shape of col
df.column_1.isnull().sum() # Number of null elements in col
df.column_1.nunique(dropna=False) # Number of unique elements in col (NaN counted once, as unique() does)

# Transformation on col --> Series: length of each value as a string, indexed by the value itself
col_1_sizes = pd.Series(df.column_1.tolist(), index=df.column_1).apply(lambda x: len(str(x)))
# Series sorting (largest first)
col_1_sizes.sort_values(ascending=False).head()
# Return a Series containing counts of unique values, most frequent first.
df.column_1.value_counts(sort=True).head(5)



```

**!!! Finding missing data in a DF**
```python
def missing_data(data):
    """Summarise the missing values of a DataFrame, one column per original column.

    Args:
        data: DataFrame to inspect.

    Returns:
        DataFrame with three rows: 'Total' (number of nulls), 'Percent'
        (share of nulls, in %) and 'Types' (dtype name of the column).
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    null_share = null_counts / null_mask.count() * 100

    summary = pd.concat([null_counts, null_share], axis=1, keys=['Total', 'Percent'])
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    # Transposed so that every original column stays a column of the report
    return summary.transpose()
```

**Transformations on DF - Examples:**
```python
## Groupby --> Followed by apply (here: total 'nombre' of every (preusuel, sexe) group)
# Several grouping columns go in a list: a tuple is read as one single key
F_fns = fns.groupby(['preusuel', 'sexe']).apply(lambda x: x['nombre'].sum())
## Or by filter (keeps the rows of the groups whose total 'nombre' is at least 1000)
F_fns = fns.groupby(['preusuel', 'sexe']).filter(lambda x: x['nombre'].sum() >= 1000)

```


**Column transformations with LOC - Examples:**
```python
# df.loc[row_mask, columns] = value --> assigns only on the rows where the mask is True
# Rows where 'condition' is greater than 6 get 0 in the target column
df.loc[df['condition'] > 6, ['target_to_tarnsform']] = 0
# Rows where 'FamilySize' equals 1 get 1 in 'IsAlone'
df.loc[df['FamilySize'] == 1, 'IsAlone'] = 1
```

# 4. Matplotlib

# *! For style* 
```python
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
```

### 1) Count Labels
```python
# One bar per label value. The column is passed by keyword:
# recent seaborn versions no longer accept it as a positional argument.
sns.countplot(x='label_col', data=train_df)


# OR (pandas only, bars ordered from most to least frequent):
train_df['label_col'].value_counts().plot.bar();
```


### 2) BOX/VIOLIN plot
```python
# "col_n" on the x axis, "label" on the y axis; swap "violinplot" for "boxplot" to get box plots
sns.violinplot(data=train_df,x="col_n", y="label")
```


### 3) Correlation within data
```python
# Annotated heatmap of the pairwise correlations between the numeric columns
fig, ax = plt.subplots(figsize=(15, 10))
# corr() raises on non-numeric columns with pandas >= 2.0: keep the numeric ones only
numeric_corr = train.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, cmap='coolwarm', ax=ax, annot=True, linewidths=2)
```


### 4) Scatter plot
**!! Scatter plot for two df with same columns (ex: train v. test, or two populations)**
```python
# SCATTER PLOT
def plot_feature_scatter(df1, df2, features):
    """Scatter-plot every feature of `df1` against the same feature of `df2`.

    The subplots are laid out on 4 columns and as many rows as needed, so any
    number of features is supported (16 features give the usual 4x4 grid).

    Args:
        df1: DataFrame providing the x values.
        df2: DataFrame providing the y values; each feature must have as many
            values as in `df1`.
        features: Names of the columns to plot, one subplot per name.
    """
    features = list(features)
    n_cols = 4
    n_rows = max(1, (len(features) + n_cols - 1) // n_cols)

    sns.set_style('whitegrid')
    # squeeze=False: `axes` is always a 2-D array, even with a single row
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 3.5 * n_rows), squeeze=False)
    flat_axes = axes.ravel()

    for ax, feature in zip(flat_axes, features):
        ax.scatter(df1[feature], df2[feature], marker='+')
        ax.set_xlabel(feature, fontsize=9)

    # Hide the cells of the grid that received no feature
    for ax in flat_axes[len(features):]:
        ax.set_visible(False)
    plt.show()
    
    
# The 16 columns to compare: 'var_0' ... 'var_15'
features = ['var_' + str(idx) for idx in range(16)]
# [::20] keeps one row out of 20 --> plotting a subset of the data is faster
plot_feature_scatter(train_df[::20], test_df[::20], features)


```

### 5) Density plots 

**!! Comparative density plots for two df with same columns (ex: train v. test, or two populations)**
```python
def plot_feature_distribution(df1, df2, label1, label2, features):
    """Overlay the density of every feature in `df1` and in `df2`.

    The subplots are laid out on 10 columns and as many rows as needed
    (100 features give the usual 10x10 grid).

    Args:
        df1: First DataFrame (e.g. rows of class 0, or the train set).
        df2: Second DataFrame with the same columns (e.g. class 1, or the test set).
        label1: Legend label of the `df1` curves.
        label2: Legend label of the `df2` curves.
        features: Names of the columns to plot, one subplot per name.
    """
    features = list(features)
    n_cols = 10
    n_rows = max(1, (len(features) + n_cols - 1) // n_cols)

    sns.set_style('whitegrid')
    # squeeze=False: `axes` is always a 2-D array, even with a single row
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 2.2 * n_rows), squeeze=False)
    flat_axes = axes.ravel()

    for ax, feature in zip(flat_axes, features):
        # bw_method replaces the deprecated `bw` argument of kdeplot
        sns.kdeplot(df1[feature], bw_method=0.5, label=label1, ax=ax)
        sns.kdeplot(df2[feature], bw_method=0.5, label=label2, ax=ax)
        ax.set_xlabel(feature, fontsize=9)
        ax.tick_params(axis='x', which='major', labelsize=6, pad=-6)
        ax.tick_params(axis='y', which='major', labelsize=6)
        # Recent seaborn versions do not draw the legend by themselves
        ax.legend(fontsize=6)

    # Hide the cells of the grid that received no feature
    for ax in flat_axes[len(features):]:
        ax.set_visible(False)
    plt.show()
    
# Example to compare in context of 1-0 classification
# t0 / t1: rows of train_df whose 'target' is 0 / 1
t0 = train_df.loc[train_df['target'] == 0]
t1 = train_df.loc[train_df['target'] == 1]
# Columns at positions 2 to 101 (100 features)
features = train_df.columns.values[2:102]
plot_feature_distribution(t0, t1, '0', '1', features)

# Also possible with Train/Test   

```

### 6) Graphs with date index
```python
# DATE HISTOGRAM
# df_histo has datetimes as index
def display_histo(df_histo, titre):
    """Plot `df_histo` against its datetime index and save the figure.

    Args:
        df_histo: pandas object indexed by datetimes, drawn with its `.plot()`.
        titre: Title of the figure; also the file name given to `plt.savefig`.
    """
    # .plot() opens its own figure and returns the Axes it drew on
    ax = df_histo.plot(figsize=(15, 8))

    ax.set_title(titre, fontsize=22)
    ax.set_xlabel('Calendrier', fontsize=12)
    ax.grid(which='major', axis='both', linestyle='--', linewidth=0.5)
    ax.legend(loc=2, prop={'size': 12})

    # One tick every 7 x-units, labelled as a date.
    # NOTE(review): assumes the x axis is expressed in days -- confirm for non-daily indexes
    start, end = ax.get_xlim()
    ax.xaxis.set_ticks(np.arange(start, end, 7))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    # Rotate once the final ticks exist, so that the rotation applies to them
    plt.xticks(rotation=45, ha="right")

    plt.savefig(titre)
```

# 5. Scikit learn

## **A. Transformations**

```python
```

#### **Train-Test split**

```python
### TRAIN_TEST_SPLIT
from sklearn.model_selection import train_test_split

# Shuffled split with the default sizes; pass random_state=... to make it reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y)
```

#### Standardization with **StandardScaler**

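The StandardScaler rescales each feature to zero mean and unit variance. It does not require normally distributed data, but it works best when features are roughly Gaussian and it is sensitive to outliers. Fit it on the training set only, then apply the same transformation to the test set.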

```python
from sklearn.preprocessing import StandardScaler

# Fit on the training set only ...
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
# ... and reuse the same fitted scaler on the test set (no second fit)
X_test = scaler.transform(X_test)
```


#### Normalization

Normalizing rescales each sample (each row) so that it has unit norm. This is useful when the similarity between two samples is measured with a dot product (e.g. cosine similarity).

```python
from sklearn.preprocessing import Normalizer

# Same fit/transform pattern as the other scikit-learn transformers
scaler = Normalizer().fit(X_train)
normalized_X = scaler.transform(X_train)
# The test set goes through the same transformer
normalized_X_test = scaler.transform(X_test)
```


## **B. Models**

#### **1) Regression**

```python
###### Elastic Net ######
from sklearn.linear_model import ElasticNetCV

# Elastic net (L1 + L2 penalization) whose regularization is selected by 5-fold cross-validation
regr = ElasticNetCV(cv=5, random_state=0)
regr.fit(X_train, y_train)

pred = regr.predict(X_test)

# score() is a method of the fitted estimator, not a standalone function
print(regr.score(X_test, y_test)) # R^2 score


```

#### **2) Classification**


```python

###### Logistic regression ######
from sklearn.linear_model import LogisticRegression

# multi_class: 'multinomial' (softmax) or 'ovr' (one-vs-rest); there is no 'binomial' option.
# NOTE(review): recent scikit-learn releases deprecate multi_class -- check your version
clf = LogisticRegression(random_state=0, solver='lbfgs',
                         multi_class='multinomial')
clf.fit(X_train, y_train)

pred = clf.predict(X_test)

clf.score(X_test, y_test) # mean accuracy



###### CatBoost ######
from catboost import CatBoostClassifier

# Every column whose dtype is not float is handed to CatBoost as categorical.
# (np.float was removed in NumPy 1.24; the builtin float is its exact equivalent)
categorical_features_indices = np.where(X.dtypes != float)[0]

clf = CatBoostClassifier(random_seed=rnd_state)

clf.fit(X_train, y_train, cat_features=categorical_features_indices)

clf.score(X_val, y_val)
```


#### **3) Unsupervised**

```python
from sklearn.decomposition import PCA

# A float n_components between 0 and 1 is a share of explained variance to reach
# (here 95%), not a fixed number of components
pca = PCA(n_components=0.95)
pca.fit(X_train)
```

## **C. Parameters**

#### **Grid search**

```python
#### For KNN, adapt for other models
# GridSearchCV lives in sklearn.model_selection (the old sklearn.grid_search module was removed)
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
# Every combination of the values below is evaluated; np.arange(1, 3) --> [1, 2]
params = {"n_neighbors": np.arange(1, 3), "metric": ["euclidean", "cityblock"]}
grid = GridSearchCV(estimator=knn, param_grid=params)
grid.fit(X_train, y_train)

print(grid.best_score_)  # Best mean cross-validated score
print(grid.best_estimator_.n_neighbors)  # n_neighbors of the best model
```

## **D. Evaluation**

#### **Regression**

***-MAE***
```python
from sklearn.metrics import mean_absolute_error

# Arguments: true values first, predicted values second
mean_absolute_error(y_test, y_test_pred)
```

***-MSE***
```python
from sklearn.metrics import mean_squared_error

# Arguments: true values first, predicted values second
mean_squared_error(y_test, y_test_pred)
```

***-R^2 score***
```python
from sklearn.metrics import r2_score

# Arguments: true values first, predicted values second
r2_score(y_test, y_test_pred)
```

#### **Classification**

***-AUC***
```python
from sklearn.metrics import roc_auc_score

# NOTE(review): the second argument is meant to be a score or probability of the positive class
# (e.g. clf.predict_proba(X_test)[:, 1]) rather than hard 0/1 predictions -- confirm what y_test_pred holds
roc_auc_score(y_test, y_test_pred)
```

***-Confusion matrix***
```python
from sklearn.metrics import confusion_matrix

# Arguments: true labels first, predicted labels second
print(confusion_matrix(y_test, y_test_pred))
```

# 5b. Text visualization

#### **WordCloud**

```python
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
# WordCloud.generate() expects one single string: join all the documents together
text = " ".join(df.text.astype(str))
wordcloud = WordCloud(stopwords=STOPWORDS, max_words=500,
                      background_color='white', min_font_size=6,
                      width=3000, collocations=False,
                      height=2500
                     ).generate(text)
plt.figure(1, figsize=(20, 20))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
```



# 6. OTHERS

**FULL_SCRIPT - All in one Feature engineering + CatBoost CV:**

In [35]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split
from itertools import product, chain
from tqdm import tqdm

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

# Any results you write to the current directory are saved as output.

RANDOM_STATE = 0

def get_x(df):
    """Build the feature matrix used by the CatBoost models from a Titanic frame.

    Works on a copy, so the caller's DataFrame is left untouched.

    Args:
        df: DataFrame with at least the columns Name, PassengerId, Cabin,
            Embarked, Ticket, Age, Fare, SibSp and Parch. A 'Survived'
            column, when present, is left out of the features.

    Returns:
        (X, cat_features): X is the 2-D array of feature values and
        cat_features holds the positions of the columns whose dtype is not
        float (the ones passed to CatBoost as categorical features).
    """
    df = df.copy()

    df['Age'] = df['Age'].fillna(-1)  # -1 marks an unknown age
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # The title is the word right before the first "." ("Braund, Mr. Owen" -> "Mr")
    title = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    title = title.replace(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
         'Sir', 'Jonkheer', 'Dona'],
        'Rare',
    )
    title = title.replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
    df['Title'] = title.fillna('na')

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    df = df.drop(['Name', 'PassengerId', 'Cabin', 'Embarked', 'Ticket'], axis=1)

    columns = [name for name in df.columns if name != 'Survived']
    # np.float was removed in NumPy 1.24; the builtin float is its exact equivalent
    cat_features = np.where(df[columns].dtypes != float)[0]
    return df[columns].values, cat_features


def get_xy(df):
    """Return the feature matrix of `df` (see get_x) and its 'Survived' column."""
    features, _unused_cat_features = get_x(df)
    return features, df['Survived']

#  
def cross_val(X, y, X_test, param, cat_features, n_splits=3):
    """Return the mean validation accuracy of one CatBoost configuration.

    Args:
        X: 2-D array of features, indexable by an array of row positions.
        y: Labels, one per row of X (array or pandas Series).
        X_test: Unused; kept so that existing callers keep working.
        param: Dict with the keys 'loss_function', 'depth' and 'l2_leaf_reg'.
        cat_features: Positions of the categorical columns of X.
        n_splits: Number of stratified folds.

    Returns:
        Accuracy on the validation fold, averaged over the `n_splits` folds.
    """
    # The fold indices are row positions: a plain array keeps y[...] positional
    # whatever the index labels of a pandas Series are
    y = np.asarray(y)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)

    fold_accuracies = []
    for tr_ind, val_ind in skf.split(X, y):
        X_train, y_train = X[tr_ind], y[tr_ind]
        X_valid, y_valid = X[val_ind], y[val_ind]

        clf = CatBoostClassifier(iterations=500,
                                 loss_function=param['loss_function'],
                                 depth=param['depth'],
                                 l2_leaf_reg=param['l2_leaf_reg'],
                                 eval_metric='Accuracy',
                                 leaf_estimation_iterations=10,
                                 use_best_model=True,
                                 logging_level='Silent')
        clf.fit(X_train,
                y_train,
                cat_features=cat_features,
                eval_set=(X_valid, y_valid))

        fold_accuracies.append(accuracy_score(y_valid, clf.predict(X_valid)))
    return sum(fold_accuracies) / n_splits
    
def catboost_GridSearchCV(X, y, X_test, params, cat_features, n_splits=5):
    """Try every parameter combination and return the most accurate one.

    Args:
        X: Training features, forwarded to cross_val.
        y: Training labels, forwarded to cross_val.
        X_test: Forwarded to cross_val.
        params: Grid of candidate values, expanded with ParameterGrid.
        cat_features: Positions of the categorical columns of X.
        n_splits: Number of cross-validation folds used for each combination.

    Returns:
        The parameter dict with the highest mean accuracy; an empty list when
        no combination scored above 0.
    """
    ps = {'acc': 0,
          'param': []
    }

    for prms in tqdm(list(ParameterGrid(params)), ascii=True, desc='Params Tuning:'):
        # Forward the caller's n_splits instead of a hard-coded 5
        acc = cross_val(X, y, X_test, prms, cat_features, n_splits=n_splits)

        if acc > ps['acc']:
            ps['acc'] = acc
            ps['param'] = prms
    print('Acc: '+str(ps['acc']))
    print('Params: '+str(ps['param']))

    return ps['param']
    
    
def main():
    """Tune CatBoost on the Titanic train set and write the submission file.

    Reads "../input/train.csv" and "../input/test.csv", selects the best
    parameters with catboost_GridSearchCV, refits a classifier on an 80/20
    stratified split of the train set and writes the test predictions to
    'cat_sub_1.csv'.
    """
    train = pd.read_csv("../input/train.csv")
    test = pd.read_csv("../input/test.csv")
    
    X_train, y_train = get_xy(train)
    # The categorical column positions are taken from the test features
    X_test, cat_features = get_x(test)
    
    # Grid explored by catboost_GridSearchCV (3 depths x 2 losses x 3 l2_leaf_reg values)
    params = {'depth':[2, 3, 4],
              'loss_function': ['Logloss', 'CrossEntropy'],
              'l2_leaf_reg':np.logspace(-20, -19, 3)
    }
    
    param = catboost_GridSearchCV(X_train, y_train, X_test, params, cat_features)

    # Final model: same settings as in cross_val, with more iterations
    clf = CatBoostClassifier(iterations=2500,
                            loss_function = param['loss_function'],
                            depth=param['depth'],
                            l2_leaf_reg = param['l2_leaf_reg'],
                            eval_metric = 'Accuracy',
                            leaf_estimation_iterations = 10,
                            use_best_model=True
    )
    # X_train / y_train are rebound to the 80% part; the other 20% is the eval set
    X_train, X_valid, y_train, y_valid = train_test_split(X_train,
                                                        y_train, 
                                                        shuffle=True,
                                                        random_state=RANDOM_STATE,
                                                        train_size=0.8,
                                                        stratify=y_train
    )
    clf.fit(X_train, 
            y_train,
            cat_features=cat_features,
            logging_level='Silent',
            eval_set=(X_valid, y_valid)
    )
    
    # One row per test passenger: its id and the predicted class as an int
    sub = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':np.array(clf.predict(X_test)).astype(int)})
    sub.to_csv('cat_sub_1.csv',index=False)
    
# Run the pipeline only when the file is executed as a script
if __name__=='__main__':
    main()

FileNotFoundError: File b'../input/train.csv' does not exist

**FULL_SCRIPT - Date transformation EXPEDIA:**

In [36]:
# Function to convert date object into relevant attributes
def convert_date_into_days(df):
    """Add stay-duration and check-in date features to `df`, in place.

    Args:
        df: DataFrame with the columns 'srch_ci' (check-in), 'srch_co'
            (check-out) and 'date_time' (search time), as datetimes or as
            strings that pd.to_datetime can parse.

    Returns:
        The same DataFrame, with the three input columns converted to
        datetimes and the columns 'stay_dur', 'no_of_days_bet_booking',
        'Cin_day', 'Cin_month' and 'Cin_year' added.
    """
    for column in ('srch_ci', 'srch_co', 'date_time'):
        df[column] = pd.to_datetime(df[column])

    # .dt.days keeps the whole-day part of each difference
    # (.astype('timedelta64[D]') is rejected by pandas >= 2.0)
    df['stay_dur'] = (df['srch_co'] - df['srch_ci']).dt.days
    df['no_of_days_bet_booking'] = (df['srch_ci'] - df['date_time']).dt.days

    # For hotel check-in
    # Month, Year, Day
    check_in = df['srch_ci'].dt
    df['Cin_day'] = check_in.day
    df['Cin_month'] = check_in.month
    df['Cin_year'] = check_in.year
    return df

**FULL_SCRIPT: CATBOOST Classifier for Titanic:**

In [37]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoostClassifier, cv

rnd_state = 42

# read data
df = pd.read_csv('../input/train.csv', index_col='PassengerId')

# -999 is used as the "missing" marker in every column
df.fillna(-999, inplace=True)

X = df.drop('Survived', axis=1)
y = df.Survived

# make train val split to try out-of-the-box
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=rnd_state)

# Every column whose dtype is not float is handed to CatBoost as categorical.
# (np.float was removed in NumPy 1.24; the builtin float is its exact equivalent)
categorical_features_indices = np.where(X.dtypes != float)[0]
clf = CatBoostClassifier(random_seed=rnd_state, custom_metric='Accuracy')
clf.fit(X_train, y_train, cat_features=categorical_features_indices)
clf.score(X_val, y_val)

# Submission 1: catboost submission with all training data and early stopping on Accuracy
test_df = pd.read_csv('../input/test.csv', index_col='PassengerId')
test_df.fillna(-999, inplace=True)
clf_od = CatBoostClassifier(random_seed=rnd_state, od_type='Iter', od_wait=20, eval_metric='Accuracy')
clf_od.fit(X, y, cat_features=categorical_features_indices)

## cross validation score
cv_data = cv(Pool(X, label=y, cat_features=categorical_features_indices), clf_od.get_params())
# NOTE(review): the names of the cv() result columns depend on the catboost version -- confirm
best_step = np.argmax(cv_data['Accuracy_test_avg'])
print(f"Best validation accuracy score: {np.max(cv_data['Accuracy_test_avg'])}±{cv_data['Accuracy_test_stddev'][best_step]} on step {best_step}")

submission = pd.DataFrame()
submission['PassengerId'] = test_df.index
submission['Survived'] = clf_od.predict(test_df).astype('int')
submission.to_csv('submission_early_stopping.csv', index=False)

FileNotFoundError: File b'../input/train.csv' does not exist

**FULL_SCRIPT - PCA & correlation circle:**

In [None]:
df = ...

# Preparation: PCA is sensitive to the scale of the features, so standardize them first
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df)
scaled_data = scaler.transform(df)

# PCA
from sklearn.decomposition import PCA
n_components = 2
pca = PCA(n_components=n_components)
pca.fit(scaled_data)
x_pca = pca.transform(scaled_data)

# PCA projection plot
plt.figure(figsize=(10,6))
# NOTE(review): `cancer` is not defined in this snippet -- c= must hold one value per row of df
plt.scatter(x_pca[:,0], x_pca[:,1], c=cancer['target'], cmap='plasma')
# xlabel / ylabel are functions: assigning to them would overwrite them without labelling anything
plt.xlabel("First Principal Component")
plt.ylabel("Second Principal component")
plt.show()

# Correlation circle

# Do a scree plot: share of the variance explained by each component
ind = np.arange(0, n_components)
(fig, ax) = plt.subplots(figsize=(8, 6))
sns.pointplot(x=ind, y=pca.explained_variance_ratio_, ax=ax)
ax.set_title('Scree plot')
ax.set_xticks(ind)
ax.set_xticklabels(ind)
ax.set_xlabel('Component Number')
ax.set_ylabel('Explained Variance')
plt.show()

# Scatter of the two first components, coloured by species.
# x / y are passed by keyword and `height` replaces the old `size` argument,
# as required by recent seaborn versions.
g = sns.lmplot(
    x='PC1',
    y='PC2',
    hue='species',
    data=df,
    fit_reg=False,
    scatter=True,
    height=7,
    )

plt.show()
 
# Plot a variable factor map for the first two dimensions.
(fig, ax) = plt.subplots(figsize=(12, 12))
# pca.components_ has one row per component and one column per original feature:
# draw one arrow per feature, i.e. iterate over the columns
for i in range(pca.components_.shape[1]):
    ax.arrow(0, 0,  # Start the arrow at the origin
             pca.components_[0, i], pca.components_[1, i],  # 0 and 1 correspond to dimension 1 and 2
             head_width=0.1, head_length=0.1)
    ax.text(pca.components_[0, i] + 0.05, pca.components_[1, i] + 0.05, df.columns.values[i])

an = np.linspace(0, 2 * np.pi, 100)  # Add a unit circle for scale
ax.plot(np.cos(an), np.sin(an))
ax.axis('equal')
ax.set_title('Variable factor map')
plt.show()