In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['nyc-taxi-trip-duration', 'titanic', 'fifa19', 'usstates', 'housing']


# 1. Automatic Feature Creation using featuretools:

In [None]:
import featuretools as ft

In [None]:
data = ft.demo.load_mock_customer()

In [None]:
customers_df = data["customers"]

In [None]:
customers_df.head()

In [None]:
sessions_df = data['sessions']

In [None]:
sessions_df.head(5)

In [None]:
transactions_df = data["transactions"]

In [None]:
transactions_df.head(5)

In [None]:
# Create new entityset
es = ft.EntitySet(id = 'customers')

In [None]:
# Register customers_df in the entity set as the 'customers' entity:
#   - index:       customer_id
#   - time index:  join_date
#   - zip_code is explicitly declared as ft.variable_types.ZIPCode
# entity_from_dataframe returns the entity set, which is rebound to `es`.

es = es.entity_from_dataframe(entity_id = 'customers', dataframe = customers_df, 
                              index = 'customer_id', time_index = 'join_date' ,variable_types =  {"zip_code": ft.variable_types.ZIPCode})

In [None]:
es

In [None]:
# Register transactions_df as the 'transactions' entity, indexed by
# transaction_id with transaction_time as its time index. product_id is
# explicitly declared Categorical.
es = es.entity_from_dataframe(entity_id="transactions",
                                 dataframe=transactions_df,
                                 index="transaction_id",
                               time_index="transaction_time",
                               variable_types={"product_id": ft.variable_types.Categorical})

In [None]:
ft.variable_types.ALL_VARIABLE_TYPES

In [None]:
es

In [None]:
# Register sessions_df as the 'sessions' entity, indexed by session_id with
# session_start as its time index. No variable_types override is passed here.
es = es.entity_from_dataframe(entity_id="sessions",
            dataframe=sessions_df,
            index="session_id", time_index = 'session_start')

In [None]:
es

In [None]:


# Link customers.customer_id to sessions.customer_id.
# NOTE(review): in the legacy featuretools API used here the first argument is
# the parent variable and the second the child -- confirm against the
# installed version.
cust_relationship = ft.Relationship(es["customers"]["customer_id"],
                       es["sessions"]["customer_id"])

# Add the relationship to the entity set
es = es.add_relationship(cust_relationship)


In [None]:

# Link sessions.session_id to transactions.session_id.
# NOTE(review): in the legacy featuretools API used here the first argument is
# the parent variable and the second the child -- confirm against the
# installed version.
sess_relationship = ft.Relationship(es["sessions"]["session_id"],
                       es["transactions"]["session_id"])

# Add the relationship to the entity set
es = es.add_relationship(sess_relationship)



In [None]:
es

In [None]:
# Run ft.dfs on the entity set with 'customers' as the target entity and a
# maximum feature depth of 3. The call returns two objects: the generated
# feature matrix and the list of feature definitions that produced it.
feature_matrix, feature_defs = ft.dfs(entityset=es,
                                        target_entity="customers",max_depth = 3)

In [None]:
feature_matrix

In [None]:
len(feature_defs)

In [None]:
feature_defs

In [None]:
# Let's talk about categorical features
sessions_df.head()


In [None]:
pd.get_dummies(sessions_df['device'],drop_first=True).head()

# 2. Handling Categorical Features: Label/Binary/Hashing and Target/Mean Encoding

## Ordinal Encoding

In [None]:
# Toy frame for the encoding examples: three rows, with a Temperature label
# and a City name per row.
df = pd.DataFrame({
    'Temperature': ['low', 'medium', 'high'],
    'City': ['London', 'New York', 'Dubai'],
})


In [None]:
df

In [None]:
# Ordinal code assigned to each temperature label.
map_dict = {
    'low': 0,
    'medium': 1,
    'high': 2,
}


def map_values(x):
    """Return the ordinal code stored in ``map_dict`` for label ``x``.

    A label that is not a key of ``map_dict`` raises ``KeyError``.
    """
    return map_dict[x]


# Encode every Temperature label into a new integer column.
df['Temperature_oe'] = df['Temperature'].apply(map_values)

In [None]:
df

## Label Encoder

In [None]:
from sklearn.preprocessing import LabelEncoder
# Create a LabelEncoder instance; it stays bound to `le` after this cell.
le = LabelEncoder()
# Fit the encoder on the 'device' column and store the resulting codes in a
# new 'device_le' column. This adds the column to sessions_df in place.
sessions_df['device_le'] = le.fit_transform(sessions_df['device'])
# Show the first rows, including the new 'device_le' column.
sessions_df.head()

In [None]:
sessions_df.head()

## Binary Encoder

In [None]:

players = pd.read_csv("../input/fifa19/data.csv")

In [None]:
len(players.Club.unique())

In [None]:

from category_encoders.binary import BinaryEncoder
# Create a BinaryEncoder configured to encode only the 'Club' column.
be = BinaryEncoder(cols = ['Club'],)
# Fit and transform the player table. `players` is rebound to the encoder's
# output, so re-running this cell operates on the already-encoded frame rather
# than on the raw CSV contents.
players = be.fit_transform(players)

In [None]:
players.head()

## Hashing Encoder

In [None]:

# Re-read the raw table: an earlier cell rebound `players` to the
# BinaryEncoder output, so the original 'Club' data must be loaded again.
players = pd.read_csv("../input/fifa19/data.csv")

from category_encoders.hashing import HashingEncoder
# Create a HashingEncoder configured to encode only the 'Club' column.
# NOTE(review): the name `be` is reused from the BinaryEncoder cell, so after
# this cell it refers to the HashingEncoder, not the BinaryEncoder.
be = HashingEncoder(cols = ['Club'])
# Fit and transform the player table; `players` is rebound to the output.
players = be.fit_transform(players)

In [None]:
players.head()

## Target/Mean Encoding

In [8]:
train = pd.read_csv("../input/titanic/train.csv")

In [9]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
# taken from https://medium.com/@pouryaayria/k-fold-target-encoding-dfe9a594874b
from sklearn import base
from sklearn.model_selection import KFold

class KFoldTargetEncoderTrain(base.BaseEstimator,
                               base.TransformerMixin):
    """Out-of-fold target (mean) encoder for one categorical column.

    For every row, the encoded value is the mean of the target computed over
    the rows of the *other* folds that share the row's category value, so a
    row's own target never contributes to its encoding. Category values that
    do not occur in the other folds fall back to the global target mean.

    Parameters
    ----------
    colnames : str
        Name of the single column to encode.
    targetName : str
        Name of the target column; it must be present in the frame passed to
        ``transform``.
    n_fold : int, default 5
        Number of folds (the folds are shuffled with ``random_state=2019``).
    verbosity : bool, default True
        When true, print the correlation between the encoded column and the
        target.
    discardOriginal_col : bool, default False
        When true, drop the original (un-encoded) ``colnames`` column from the
        returned frame.
    """

    def __init__(self,colnames,targetName,
                  n_fold=5, verbosity=True,
                  discardOriginal_col=False):
        self.colnames = colnames
        self.targetName = targetName
        self.n_fold = n_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col

    def fit(self, X, y=None):
        """No-op: all statistics are computed per fold inside ``transform``."""
        return self

    def transform(self,X):
        """Return a copy of ``X`` with a ``<colnames>_Kfold_Target_Enc`` column.

        The input frame is not modified.

        Raises
        ------
        AssertionError
            If ``colnames``/``targetName`` are not strings or are not columns
            of ``X``.
        """
        assert isinstance(self.targetName, str)
        assert isinstance(self.colnames, str)
        assert self.colnames in X.columns
        assert self.targetName in X.columns

        # Work on a copy so the caller's DataFrame is left untouched.
        X = X.copy()
        mean_of_target = X[self.targetName].mean()
        kf = KFold(n_splits = self.n_fold,
                   shuffle = True, random_state=2019)
        col_mean_name = self.colnames + '_' + 'Kfold_Target_Enc'
        X[col_mean_name] = np.nan
        # Positional writes keep the assignment correct even when the index
        # contains duplicate labels.
        enc_pos = X.columns.get_loc(col_mean_name)
        for tr_ind, val_ind in kf.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            fold_means = X_tr.groupby(self.colnames)[self.targetName].mean()
            X.iloc[val_ind, enc_pos] = X_val[self.colnames].map(fold_means).values
        # Categories unseen in the training folds mapped to NaN above; fill
        # them with the global mean. Assigning the result back (instead of a
        # chained inplace fillna) also works under pandas Copy-on-Write.
        X[col_mean_name] = X[col_mean_name].fillna(mean_of_target)

        if self.verbosity:
            encoded_feature = X[col_mean_name].values
            print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name,self.targetName,
                   np.corrcoef(X[self.targetName].values,
                               encoded_feature)[0][1]))
        if self.discardOriginal_col:
            # Drop the encoded source column, not the target: the previous
            # code dropped self.targetName here, discarding the label instead.
            X = X.drop(self.colnames, axis=1)
        return X

In [11]:
# Encode Pclass against Survived with 5 folds and keep the transformed frame.
targetc = KFoldTargetEncoderTrain(
    colnames='Pclass',
    targetName='Survived',
    n_fold=5,
)
new_train = targetc.fit_transform(train)

Correlation between the new feature, Pclass_Kfold_Target_Enc and, Survived is 0.33349480268464116.


In [12]:
new_train[['Pclass_Kfold_Target_Enc','Pclass']].head()

Unnamed: 0,Pclass_Kfold_Target_Enc,Pclass
0,0.242268,3
1,0.642045,1
2,0.248756,3
3,0.640244,1
4,0.242268,3


# 3. How best to use Latitude and Longitude features - Part 1:

In [None]:
train = pd.read_csv("../input/nyc-taxi-trip-duration/train.csv")

In [None]:
# Down-sample to 500 rows. random_state pins the draw so that a fresh
# Restart & Run All selects the same rows (2019 matches the seed already used
# for the KFold split in this notebook).
train = train.sample(500, random_state=2019)

In [None]:
def haversine_array(lat1, lng1, lat2, lng2):
    """Haversine distance in kilometres between two points given in degrees.

    Every operation is a NumPy ufunc, so the arguments may be scalars or
    equally shaped arrays; the result has the broadcast shape of the inputs.
    """
    AVG_EARTH_RADIUS = 6371  # in km
    phi1, lam1, phi2, lam2 = (np.radians(v) for v in (lat1, lng1, lat2, lng2))
    dphi = phi2 - phi1
    dlam = lam2 - lam1
    # Haversine term: sin^2(dphi/2) + cos(phi1) * cos(phi2) * sin^2(dlam/2)
    hav = np.sin(dphi * 0.5) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlam * 0.5) ** 2
    return 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(hav))

In [None]:
# haversine_array is built entirely from NumPy ufuncs, so pass the whole
# coordinate columns in one call instead of invoking it once per row through
# DataFrame.apply(axis=1).
train['haversine_distance'] = haversine_array(
    train['pickup_latitude'].values, train['pickup_longitude'].values,
    train['dropoff_latitude'].values, train['dropoff_longitude'].values)

In [None]:
def dummy_manhattan_distance(lat1, lng1, lat2, lng2):
    """Sum of two haversine legs: one along longitude, one along latitude.

    The first leg keeps the latitude at ``lat1`` and moves from ``lng1`` to
    ``lng2``; the second keeps the longitude at ``lng1`` and moves from
    ``lat1`` to ``lat2``. The units are those returned by ``haversine_array``.
    """
    lng_leg = haversine_array(lat1, lng1, lat1, lng2)
    lat_leg = haversine_array(lat1, lng1, lat2, lng1)
    return lng_leg + lat_leg

In [None]:
# dummy_manhattan_distance only combines haversine_array results, which are
# NumPy-vectorised, so pass the whole coordinate columns in one call instead
# of invoking it once per row through DataFrame.apply(axis=1).
train['manhattan_distance'] = dummy_manhattan_distance(
    train['pickup_latitude'].values, train['pickup_longitude'].values,
    train['dropoff_latitude'].values, train['dropoff_longitude'].values)

In [None]:
def bearing_array(lat1, lng1, lat2, lng2):
    """Initial bearing from point 1 to point 2, in degrees.

    Inputs are latitudes/longitudes in degrees and may be scalars or equally
    shaped NumPy arrays. The result comes from ``arctan2`` and therefore lies
    in [-180, 180]: 0 is due north, 90 due east, -90 due west.
    """
    # Only the longitude *difference* enters the formula, so the individual
    # longitudes never need converting to radians.
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lat2 = np.radians(lat1), np.radians(lat2)
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

In [None]:
# bearing_array is built entirely from NumPy ufuncs, so pass the whole
# coordinate columns in one call instead of invoking it once per row through
# DataFrame.apply(axis=1).
train['bearing'] = bearing_array(
    train['pickup_latitude'].values, train['pickup_longitude'].values,
    train['dropoff_latitude'].values, train['dropoff_longitude'].values)

In [None]:
# Arithmetic midpoint of the pickup and dropoff coordinates, computed on the
# raw NumPy arrays.
train['center_latitude'] = (
    train['pickup_latitude'].values + train['dropoff_latitude'].values
) / 2
train['center_longitude'] = (
    train['pickup_longitude'].values + train['dropoff_longitude'].values
) / 2

In [None]:
train.head()

# log feature transformation

In [None]:
import plotly_express as px


In [None]:
px.histogram(train,x='trip_duration')

In [None]:
# log(1 + x) of the trip duration. np.log1p is applied to the whole column at
# once (no per-element Python lambda).
train['log_trip_duration'] = np.log1p(train['trip_duration'])

In [None]:
px.histogram(train,x='log_trip_duration')