In [1]:
# Jupyter notebook to analyze datasets to be used for machine learning training of models, and model scoring

# import Pandas library to manipulate and analyze datasets
import pandas as pd
import numpy as np

# Show every column when a DataFrame is rendered (None removes the column limit).
pd.options.display.max_columns = None

In [24]:
# load datasets

# Telco customer churn: the train and test partitions live in separate CSV files.
df_churn_train = pd.read_csv(
    './sets/customer_churn_train.csv',
    sep=",",
    encoding="utf-8",
    low_memory=False,
)
df_churn_test = pd.read_csv(
    './sets/customer_churn_test.csv',
    sep=",",
    encoding="utf-8",
    low_memory=False,
)

# tumor malignant
# NOTE(review): the test frame used to be read from './sets/market_train.csv', which looks
# like a copy-paste slip. './sets/cancer_test.csv' is assumed by analogy with the churn
# train/test file names -- confirm the file exists under that name.
df_cancer_train = pd.read_csv('./sets/cancer_train.csv', sep=",", encoding="utf-8", low_memory=False)
df_cancer_test = pd.read_csv('./sets/cancer_test.csv', sep=",", encoding="utf-8", low_memory=False)

# Hospital patient re-admission.
df_readmit = pd.read_csv(
    './sets/patient_re_admit_train.csv',
    sep=",",
    encoding="utf-8",
    low_memory=False,
)

# Boston housing sale data.
df_boston_housing = pd.read_csv(
    './sets/boston_housing_train.csv',
    sep=",",
    encoding="utf-8",
    low_memory=False,
)

# Customer life-time value.
df_ltv = pd.read_csv(
    './sets/ltv_train.csv',
    sep=",",
    encoding="utf-8",
    low_memory=False,
)

# Employee retention.
df_retention = pd.read_csv(
    './sets/retention_train.csv',
    sep=",",
    encoding="utf-8",
    low_memory=False,
)

# Bank loan defaults.
df_loan_default = pd.read_csv(
    './sets/loan_default_dr_train.csv',
    sep=",",
    encoding="utf-8",
    low_memory=False,
)

# Predictive maintenance for equipment.
df_pdm = pd.read_csv(
    './sets/pdm_machine_train.csv',
    sep=",",
    encoding="utf-8",
    low_memory=False,
)

# Vermont Public Safety datasets, fetched over HTTP as JSON on every run.
# Catalogue: https://data.vermont.gov/browse?q=public%20safety&sortBy=relevance
# NOTE(review): both DUI frames show exactly 1000 rows in the recorded outputs, which may
# reflect a default page size of the endpoint rather than the full dataset -- confirm.

# Traffic fatalities.
df_traffic_fatalities = pd.read_json(
    'https://data.vermont.gov/resource/kurq-9xgq.json'
)

# DUI arrests.
df_dui = pd.read_json(
    'https://data.vermont.gov/resource/cgjb-4rbe.json'
)

# DUI arrests that involved a crash.
df_dui_crash = pd.read_json(
    'https://data.vermont.gov/resource/qpcy-6kzw.json'
)

In [None]:
# employee retention analysis

In [25]:
# (rows, columns) of the employee retention frame
df_retention.shape

(11998, 10)

In [27]:
# Column labels of the employee retention frame, as a plain list.
df_retention.columns.tolist()

['satisfaction_level',
 'last_evaluation',
 'number_project',
 'average_montly_hours',
 'time_spend_company',
 'Work_accident',
 'left',
 'promotion_last_5years',
 'group_id',
 'salary']

In [33]:
# Preview the first rows (head() returns five rows by default).
df_retention.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,group_id,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [29]:
# Number of employees per group, largest group first.
df_retention.loc[:, 'group_id'].value_counts()

sales          3245
technical      2244
support        1821
IT              976
RandD           695
product_mng     686
marketing       672
accounting      622
hr              601
management      436
Name: group_id, dtype: int64

In [None]:
# boston housing data

In [31]:
# (rows, columns) of the Boston housing frame
df_boston_housing.shape

(29998, 33)

In [32]:
# Preview the first rows (head() returns five rows by default).
df_boston_housing.head()

Unnamed: 0,PID,ST_NUM,ST_NAME,ST_NAME_SUF,ZIPCODE,Assessed_Value,Lot_Area,Gross_Area,Living_Area,Owner_Occupied,Year_Built,Number_of_Floors,Total_Number_of_Rooms,Number_of_Bedrooms,Number_of_Full_Baths,Number_of_Half_Baths,Number_of_Kitchens,Has_AC,Number_of_Fireplaces,Year_Since_Remodel_or_Build,Year_Remodeled,Structure_Type,Building_Style,Roof_Type,Exterior_Finish,Main_Bathroom_Style,Main_Kitchen_Style,Heating_type,Exterior_Condition,Overall_Condition,Interior_Condition,Interior_Finish,View
0,2001658000_,43,STRATFORD,ST,02132_,963300,20897,7396,3887,1,1910,2.0,13,4,1,1,1,1,1,3,2014,Residential,Colonial,Hip,Vinyl,Modern,Modern,Forced Air,Average,Average,Average,Normal,Average
1,2001659000_,47,STRATFORD,ST,02132_,915600,9856,6730,3566,1,1910,2.0,9,4,1,0,1,0,2,25,1992,Residential,Colonial,Hip,Wood Shake,Modern,Modern,Hot Water,Average,Average,Good,Normal,Average
2,2001660000_,53,STRATFORD,ST,02132_,911400,8415,6442,2843,1,1910,2.0,11,6,2,1,1,1,1,6,2011,Residential,Colonial,Gable,Wood Shake,Semi-Modern,Modern,Hot Water,Good,Good,Good,Normal,Average
3,2001661000_,57,STRATFORD,ST,02132_,862500,8333,6020,3558,1,1910,2.5,11,6,2,0,1,0,1,27,1990,Residential,Colonial,Hip,Wood Shake,Semi-Modern,Modern,Hot Water,Average,Average,Good,Normal,Average
4,2001662000_,61,STRATFORD,ST,02132_,789300,8232,5574,2978,1,1910,2.0,9,5,2,0,1,0,1,14,2003,Residential,Colonial,Gable,Wood Shake,Modern,Modern,Hot Water,Average,Average,Good,Normal,Average


In [35]:
# How many properties fall under each Has_AC value.
df_boston_housing.loc[:, 'Has_AC'].value_counts()

0    24366
1     5632
Name: Has_AC, dtype: int64

In [None]:
# vermont dui data

In [36]:
# Preview the first rows of the DUI frame (head() returns five rows by default).
df_dui.head()

Unnamed: 0,incident_id,incident_date_time,agency,incident_type,town_of_incident,county_of_incident,county_of_arrest,arrest_statutes,arresting_agency,arrestee_gender,arrestee_age,count
0,16C104877,2016-09-04 18:19:00,VSP: Rutland,Drive/Operate Under Influence,Mendon,Rutland,Rutland,23V1201A1#3,VSP: Rutland,Male,73,1
1,16B103451,2016-09-04 01:58:00,VSP: Saint Johnsbury,Drive/Operate Under Influence|DLS|Violate Cond...,Lyndon,Caledonia,Caledonia,23V1201,VSP: Saint Johnsbury,Male,22,1
2,16A303786,2016-09-03 19:51:00,VSP: Middlesex,Accident - Injury|Drive/Operate Under Influence,East Montpelier,Washington,Washington,23V1201,VSP: Middlesex,Male,57,1
3,16B103435,2016-09-03 00:33:00,VSP: Saint Johnsbury,Drive/Operate Under Influence,Lyndon,Caledonia,Caledonia,23V1201,VSP: Saint Johnsbury,Female,25,1
4,16B103419,2016-09-02 01:56:00,VSP: Saint Johnsbury,Drive/Operate Under Influence,Lyndon,Caledonia,Caledonia,23V1201A1#1/R,VSP: Saint Johnsbury,Male,22,1


In [38]:
# DUI arrests broken down by arrestee gender.
df_dui.loc[:, 'arrestee_gender'].value_counts()

Male      725
Female    275
Name: arrestee_gender, dtype: int64

In [None]:
# vermont traffic fatalities

In [39]:
# Preview the first rows of the traffic fatalities frame (head() returns five rows by default).
df_traffic_fatalities.head()

Unnamed: 0,incident_no,agency,crash_date,crash_city,crash_day,year,conditions,crash_circumstances,role,crash_injury,seatbelt,gender,age,count,crash_time,crash_location,latitude,:@computed_region_5r79_s8s6
0,193903,VSP - B2 Royalton,2017-03-19T00:00:00.000,Reading,2017,Sunday,Good,Driving too fast for conditions,Driver,Fatality (K),No,Male,20.7,1,NaT,,,
1,193577,VSP - B4 Rutland,2017-03-18T00:00:00.000,Killington,2017,Saturday,Good,Exceeded authorized speed limit,Passenger,Fatality (K),No,Male,30.0,1,2022-04-23 01:00:00,,,
2,194346,VSP - A5 Derby,2017-02-25T00:00:00.000,Barton,2017,Saturday,Good,Failure to keep in proper lane,Driver,Fatality (K),Yes,Male,73.4,1,2022-04-23 15:00:00,MinCollector,"{'latitude': '44.81912', 'longitude': '-72.184...",3215.0
3,192599,VSP - B2 Royalton,2017-02-16T00:00:00.000,Hartland,2017,Thursday,Slick,Driving too fast for conditions,Driver,Fatality (K),Yes,Male,50.0,1,2022-04-23 06:00:00,I-91,"{'latitude': '43.59563', 'longitude': '-72.347...",3218.0
4,192095,VSP - A2 St. Albans,2017-02-04T00:00:00.000,Georgia,2017,Saturday,Good,Operating vehicle in erratic- reckless- carele...,Driver,Fatality (K),No,Male,35.2,1,2022-04-23 16:00:00,I-89,,


In [40]:
# Fatality records broken down by the person's role in the crash.
df_traffic_fatalities.loc[:, 'role'].value_counts()

Driver        208
Passenger      60
Pedestrian     15
Bicycle         4
Name: role, dtype: int64

In [14]:
# how many rows / columns in the DUI-with-crash frame
# `df3` is not assigned in any visible cell, so this cell raised NameError on a fresh kernel;
# the recorded output (12 DUI columns, crash incident types) matches `df_dui_crash`.
df_dui_crash.shape

(1000, 12)

In [5]:
# list out the columns of the traffic fatalities frame
# `df` is not assigned in any visible cell; the recorded column list is exactly the
# column set of `df_traffic_fatalities`, so that name is used directly.
list(df_traffic_fatalities)

['incident_no',
 'agency',
 'crash_date',
 'crash_city',
 'crash_day',
 'year',
 'conditions',
 'crash_circumstances',
 'role',
 'crash_injury',
 'seatbelt',
 'gender',
 'age',
 'count',
 'crash_time',
 'crash_location',
 'latitude',
 ':@computed_region_5r79_s8s6']

In [15]:
# show first few rows of the DUI-with-crash frame
# `df3` is not assigned in any visible cell; the recorded rows (DUI columns, every
# incident type involving an accident) match `df_dui_crash`.
df_dui_crash.head(10)

Unnamed: 0,incident_id,incident_date_time,agency,incident_type,town_of_incident,county_of_incident,county_of_arrest,arrest_statutes,arresting_agency,arrestee_gender,arrestee_age,count
0,16A303786,2016-09-03 19:51:00,VSP: Middlesex,Accident - Injury|Drive/Operate Under Influence,East Montpelier,Washington,Washington,23V1201,VSP: Middlesex,Male,57,1
1,16A104400,2016-08-26 18:27:00,VSP: Williston,Accident - Property|Drive/Operate Under Influence,Unknown,Unknown,Unknown,23V1201,VSP: Williston,Male,34,1
2,16A303608,2016-08-23 23:26:00,VSP: Middlesex,Drive/Operate Under Influence|Accident - Property,Middlesex,Washington,Washington,23V1201,VSP: Middlesex,Female,27,1
3,16C104611,2016-08-21 23:46:00,VSP: Rutland,Accident - Property|Drive/Operate Under Influe...,Rutland Town,Rutland,Rutland,23V1201,VSP: Rutland,Male,56,1
4,16C104560,2016-08-19 02:26:00,VSP: Rutland,Accident - Property|Drive/Operate Under Influence,Clarendon,Rutland,Rutland,23V1201,VSP: Rutland,Female,22,1
5,16D303185,2016-08-15 06:50:00,VSP: Royalton,Drive/Operate Under Influence|DLS|Violate Cond...,Strafford,Orange,Orange,23V1201A1#3,VSP: Royalton,Male,39,1
6,16D303185,2016-08-15 06:50:00,VSP: Royalton,Drive/Operate Under Influence|DLS|Violate Cond...,Strafford,Orange,Orange,23V1201A1#3,VSP: Royalton,Male,39,1
7,16B103026,2016-08-14 19:20:00,VSP: Saint Johnsbury,Accident - Property|Accident - Property|Drive/...,Waterford,Caledonia,Caledonia,23V1201,VSP: Saint Johnsbury,Male,56,1
8,16C202664,2016-08-14 14:54:00,VSP: New Haven,Drive/Operate Under Influence|Accident - Property,Leicester,Addison,Addison,23V1201,VSP: New Haven,Male,28,1
9,16A303435,2016-08-14 01:30:00,VSP: Middlesex,Drive/Operate Under Influence|Accident - Property,Fayston,Washington,Washington,23V1201,VSP: Middlesex,Male,27,1


In [7]:
# How many fatality records per crash city
# `df` is not assigned in any visible cell; `crash_city` is a column of
# `df_traffic_fatalities`, so that frame is referenced explicitly.
df_traffic_fatalities['crash_city'].value_counts()

Georgia             8
St. Johnsbury       8
Killington          7
Marshfield          6
Weathersfield       5
                   ..
Fletcher            1
Woodbury            1
South Burlington    1
Brookfield          1
Newbury             1
Name: crash_city, Length: 139, dtype: int64

In [12]:
# How many DUI incidents per county
# `df2` is not assigned in any visible cell, so this cell raised NameError on a fresh kernel.
# NOTE(review): presumably `df2` was the plain DUI frame (`df_dui`), since the crash
# variant is counted in its own cell -- confirm against the recorded counts.
df_dui['county_of_incident'].value_counts()

Caledonia     135
Rutland       112
Windham       109
Orange        105
Addison        84
Bennington     83
Windsor        81
Franklin       73
Washington     63
Orleans        53
Chittenden     42
Lamoille       21
Unknown        20
Essex          14
Grand Isle      5
Name: county_of_incident, dtype: int64

In [17]:
# How many DUI-with-crash incidents per county
# `df3` is not assigned in any visible cell; the other `df3` cells show DUI-with-crash
# rows, so `df_dui_crash` is referenced explicitly.
df_dui_crash['county_of_incident'].value_counts()

Windham       122
Orange        112
Rutland       104
Caledonia     101
Washington     90
Addison        82
Windsor        78
Bennington     68
Orleans        66
Franklin       60
Chittenden     49
Unknown        30
Lamoille       22
Essex          12
Grand Isle      4
Name: county_of_incident, dtype: int64

In [16]:
# Class balance of the 'Churn?' label.
# NOTE(review): `df` is not assigned in any visible cell -- presumably one of the churn
# frames; confirm which one before relying on Restart & Run All.
df.loc[:, 'Churn?'].value_counts()

False.    2562
True.      436
Name: Churn?, dtype: int64

In [19]:
# Exclude the columns that are not needed for training.
# NOTE(review): `df` is not assigned in any visible cell, and because the result is bound
# back to `df`, running this cell twice raises KeyError on the already-removed columns.
df = df.drop(columns=['Area Code', 'Phone'])

In [35]:
# create a reproducible random train / test (~20 percent) split
# Each row independently lands in the test set with probability 0.20, so the split is
# approximate rather than exact. A fixed seed makes the mask, and therefore both
# partitions, identical on every run; row counts will differ from the earlier unseeded run.
split_rng = np.random.default_rng(42)
msk = split_rng.random(len(df)) < 0.20
train = df[~msk]  # rows where the mask is False
test = df[msk]    # rows where the mask is True

In [36]:
# (rows, columns) of the training partition
train.shape

(2664, 21)

In [37]:
# (rows, columns) of the test partition
test.shape

(669, 21)

In [12]:
# Stack the two frames row-wise into a single dataset (original index labels are kept).
# NOTE(review): `df` and `df2` are not assigned in any visible cell; judging by the file
# name used when the result is saved, they are presumably patient re-admission frames -- confirm.
df_both = pd.concat(objs=[df, df2], axis=0)

In [13]:
# num of rows (and columns) of the merged frame, as a (rows, columns) tuple
df_both.shape

(101766, 50)

In [14]:
# Persist the merged frame as comma-separated text, leaving the index out of the file.
df_both.to_csv(
    './sets/patient_re_admit.csv',
    sep=',',
    index=False,
)

In [15]:
# rebind `df` to the merged frame (same object as `df_both`, not a copy)
df = df_both