In [None]:
# Jupyter notebook to analyze datasets to be used for machine learning training of models, and model scoring

# import Pandas library to manipulate and analyze datasets
import pandas as pd
import numpy as np
import os

# render every column of wide DataFrames (None removes the column cap)
pd.options.display.max_columns = None

In [None]:
# show all Coder workspace environment variables
# NOTE: a bare %env lists every environment variable visible to the kernel,
# which may include tokens or other secrets -- clear this cell's output
# before sharing or committing the notebook.
%env


In [None]:
# load datasets

# folder (relative to the notebook's working directory) holding the CSV files
DATA_DIR = './sets'


def load_csv(filename):
    """Read one CSV file from DATA_DIR into a DataFrame.

    Every local dataset uses the same format: comma separated, UTF-8 encoded.
    low_memory=False parses the file in one pass instead of in chunks, which
    avoids columns ending up with mixed inferred dtypes.

    Parameters
    ----------
    filename : str
        File name inside DATA_DIR, e.g. 'cancer_train.csv'.

    Returns
    -------
    pandas.DataFrame
    """
    return pd.read_csv(f'{DATA_DIR}/{filename}', sep=",", encoding="utf-8", low_memory=False)


# telco provider customer churn
df_churn_train = load_csv('customer_churn_train.csv')
df_churn_test = load_csv('customer_churn_test.csv')

# tumor malignant
df_cancer_train = load_csv('cancer_train.csv')
# Previously this read 'market_train.csv', an apparent copy-paste slip that put
# an unrelated dataset into the cancer test frame.
# NOTE(review): confirm ./sets/cancer_test.csv is the intended file.
df_cancer_test = load_csv('cancer_test.csv')

# hospital patient readmit
df_readmit = load_csv('patient_re_admit_train.csv')

# boston housing sale data
df_boston_housing = load_csv('boston_housing_train.csv')

# customer life-time value
df_ltv = load_csv('ltv_train.csv')

# employee retention
df_retention = load_csv('retention_train.csv')

# bank loan defaults
df_loan_default = load_csv('loan_default_dr_train.csv')

# predictive maintenance for equipment
df_pdm = load_csv('pdm_machine_train.csv')

# Vermont Public Safety datasets
# https://data.vermont.gov/browse?q=public%20safety&sortBy=relevance
# These three frames are fetched over the network each time the cell runs.
# NOTE(review): the endpoints may page their results -- confirm the full
# datasets are returned if row counts matter.

# traffic fatalities
df_traffic_fatalities = pd.read_json('https://data.vermont.gov/resource/kurq-9xgq.json')

# dui
df_dui = pd.read_json('https://data.vermont.gov/resource/cgjb-4rbe.json')

# dui with crash
df_dui_crash = pd.read_json('https://data.vermont.gov/resource/qpcy-6kzw.json')

In [None]:
# employee retention analysis

In [None]:
# (rows, columns) of the retention training set
df_retention.shape

In [None]:
# column names (iterating a DataFrame yields its column labels)
list(df_retention)

In [None]:
# preview the first 5 rows
df_retention.head(5)

In [None]:
# number of rows per distinct group_id value (missing values are excluded by default)
df_retention['group_id'].value_counts()

In [None]:
# boston housing data

In [None]:
# (rows, columns) of the Boston housing training set
df_boston_housing.shape

In [None]:
# preview the first 5 rows
df_boston_housing.head(5)

In [None]:
# number of rows per distinct Has_AC value (missing values are excluded by default)
df_boston_housing['Has_AC'].value_counts()

In [None]:
# vermont dui data

In [None]:
# preview the first 5 rows of the DUI records fetched from data.vermont.gov
df_dui.head(5)

In [None]:
# number of records per distinct arrestee_gender value (missing values are excluded by default)
df_dui['arrestee_gender'].value_counts()

In [None]:
# vermont traffic fatalities

In [None]:
# preview the first 5 rows of the traffic fatality records fetched from data.vermont.gov
df_traffic_fatalities.head(5)

In [None]:
# number of records per distinct role value (missing values are excluded by default)
df_traffic_fatalities['role'].value_counts()

In [None]:
# customer churn data

In [None]:
# preview the first 5 rows of the churn training set
df_churn_train.head(5)

In [None]:
# remove the 'Area Code' and 'Phone' columns, which are not relevant for training;
# drop() returns a new frame, so df_churn_train itself is left untouched
df = df_churn_train.drop(columns=['Area Code', 'Phone'])

In [None]:
# create a reproducible random train / test split (~20 percent test)
TEST_FRACTION = 0.20
SPLIT_SEED = 42  # fixed seed so Restart & Run All always produces the same split

# Each row independently lands in the test set with probability TEST_FRACTION,
# so the test share is approximately (not exactly) 20 percent.
rng = np.random.default_rng(SPLIT_SEED)
msk = rng.random(len(df)) < TEST_FRACTION
train = df[~msk]
test = df[msk]

# the two masks are complementary, so every row belongs to exactly one subset
assert len(train) + len(test) == len(df)

In [None]:
# (rows, columns) of the training subset
train.shape

In [None]:
# (rows, columns) of the test subset
test.shape

In [None]:
# stack the train and test subsets row-wise into one frame (train rows first);
# the original index labels are kept rather than renumbered
df_both = pd.concat([train, test], axis=0, ignore_index=False)

In [None]:
# (rows, columns) of the recombined frame; the first element is the number of rows
df_both.shape

In [None]:
# export the recombined churn frame as comma-separated text, leaving out the index column
df_both.to_csv('./sets/customer_churn.csv', sep=',', index=False)