# TPS 12 - Comparison of Original and Synthetic Data

In this notebook, we look at the synthetic data provided with the competition and compare it with the original competition data from the Forest Cover Type competition as well as the original data from the UCI ML repository.

In [1]:
# Notebook-wide settings, gathered here so a quick test run only needs edits in one place
NUM_FOLDS = 5
RANDOM_SEED = 0

In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import time
import os
import pyarrow
import gc

# Model evaluation
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score, mean_squared_error

# Models
import xgboost as xgb
from xgboost import XGBClassifier


# Plotting
import matplotlib
import seaborn as sns
from matplotlib import pyplot as plt

# Silence every warning so none of them clutter the cell outputs
import warnings
warnings.filterwarnings(action='ignore')

# Pandas display limits: never truncate rows or columns when rendering a frame
pd.options.display.max_rows = None
pd.options.display.max_columns = None
#pd.set_option('float_format', '{:f}'.format)

# Load Data

In [3]:
# Print the path of every CSV file found under the data directory.
# The directory is built with os.path.join instead of the literal '..\data':
# '\d' is an invalid escape sequence (a warning in current Python, slated to
# become an error), and a hard-coded backslash only resolves on Windows.
# On Windows the joined path is still '..\data', so the printed paths match.
for dirname, _, filenames in os.walk(os.path.join(os.pardir, 'data')):
    for filename in filenames:
        if filename.endswith('.csv'):
            print(os.path.join(dirname, filename))

..\data\original_test.csv
..\data\original_train.csv
..\data\sample_submission.csv
..\data\test.csv
..\data\train.csv


In [4]:
%%time

# Read the original dataset, the synthetic TPS train/test sets and the sample submission
original = pd.read_feather('../data/original.feather')
train = pd.read_feather('../data/train.feather')
test = pd.read_feather('../data/test.feather')
submission = pd.read_csv('../data/sample_submission.csv')

# Split the feature columns: everything except the id/target is a feature;
# 'Wild*' and 'Soil*' columns form the binary group, the remainder is numerical
features = [col for col in train.columns if col not in ('Id', 'Cover_Type')]
wilderness_cols = [col for col in features if col.startswith('Wild')]
binary_cols = [col for col in features if col.startswith('Soil') or col in wilderness_cols]
numerical_cols = [col for col in features if col not in binary_cols]

gc.collect()

# Report the dimensions of each loaded frame, one line per dataset
print('\n'.join(
    f'{label}: {frame.shape[0]} rows, {frame.shape[1]} cols'
    for label, frame in (('Original', original), ('Train', train), ('Test', test))
))

Original: 581012 rows, 55 cols
Train: 4000000 rows, 56 cols
Test: 1000000 rows, 55 cols
Wall time: 363 ms


# Compare Numerical Variables

In [5]:
def check_numerical(data):
    """Return a compact table of summary statistics, one row per column of `data`.

    The table is `data.describe()` transposed, with the 'count' column removed.
    'std' is rounded to one decimal place; 'mean', 'min', the three quartiles
    and 'max' are cast to int32, which drops their fractional part.
    """
    integer_stats = ['mean', 'min', '25%', '50%', '75%', 'max']
    summary = data.describe().T.drop(columns='count')
    summary = summary.assign(std=summary['std'].round(1))
    return summary.astype({stat: 'int32' for stat in integer_stats})
    

In [6]:
# Summary statistics of the numerical features in the original dataset
check_numerical(original[numerical_cols])

Unnamed: 0,mean,std,min,25%,50%,75%,max
Elevation,2959,280.0,1859,2809,2996,3163,3858
Aspect,155,111.9,0,58,127,260,360
Slope,14,7.5,0,9,13,18,66
Horizontal_Distance_To_Hydrology,269,212.5,0,108,218,384,1397
Vertical_Distance_To_Hydrology,46,58.3,-173,7,30,69,601
Horizontal_Distance_To_Roadways,2350,1559.3,0,1106,1997,3328,7117
Hillshade_9am,212,26.8,0,198,218,231,254
Hillshade_Noon,223,19.8,0,213,226,237,254
Hillshade_3pm,142,38.3,0,119,143,168,254
Horizontal_Distance_To_Fire_Points,1980,1324.2,0,1024,1710,2550,7173


In [7]:
# Summary statistics of the same numerical features in the synthetic training data
check_numerical(train[numerical_cols])

Unnamed: 0,mean,std,min,25%,50%,75%,max
Elevation,2980,289.0,1773,2760,2966,3217,4383
Aspect,151,110.0,-33,60,123,247,407
Slope,15,8.5,-3,9,14,20,64
Horizontal_Distance_To_Hydrology,271,226.5,-92,110,213,361,1602
Vertical_Distance_To_Hydrology,51,68.2,-317,4,31,78,647
Horizontal_Distance_To_Roadways,1766,1315.6,-287,822,1436,2365,7666
Hillshade_9am,211,30.8,-4,198,218,233,301
Hillshade_Noon,221,22.2,49,210,224,237,279
Hillshade_3pm,140,43.7,-53,115,142,169,272
Horizontal_Distance_To_Fire_Points,1581,1127.6,-277,781,1361,2084,8075


# Wilderness Areas

In [8]:
def counts_per_thousand(columns, original_df=None, synthetic_df=None):
    """Compare per-thousand-row column totals between two datasets.

    For each entry of `columns`, the column sum is divided by the number of
    rows of its frame, multiplied by 1000 and rounded to two decimals.

    Parameters
    ----------
    columns : list of str
        Column names present in both frames.
    original_df : DataFrame, optional
        Frame reported under 'Original'. Defaults to the notebook-level
        `original` frame, so existing single-argument calls are unchanged.
    synthetic_df : DataFrame, optional
        Frame reported under 'Synthetic'. Defaults to the notebook-level
        `train` frame.

    Returns
    -------
    DataFrame
        Indexed by `columns`, with the two columns 'Original' and 'Synthetic'.
    """
    # Fall back to the notebook globals only when no frame is passed explicitly
    if original_df is None:
        original_df = original
    if synthetic_df is None:
        synthetic_df = train

    def _rate_per_thousand(frame):
        # Column totals scaled to a rate per 1000 rows of this frame
        return (frame[columns].sum(axis=0) / frame.shape[0] * 1000).round(2)

    result = pd.DataFrame({'Original': _rate_per_thousand(original_df)})
    result['Synthetic'] = _rate_per_thousand(synthetic_df)
    return result

In [9]:
# Per-thousand-row totals of each 'Wild*' column, original vs. synthetic
counts_per_thousand(wilderness_cols)

Unnamed: 0,Original,Synthetic
Wilderness_Area1,448.87,261.19
Wilderness_Area2,51.43,41.66
Wilderness_Area3,436.07,653.57
Wilderness_Area4,63.63,21.82


# Soil Types

In [10]:
# Per-thousand-row totals for Soil_Type1 through Soil_Type10, original vs. synthetic
counts_per_thousand([f'Soil_Type{i}' for i in range(1,11)])

Unnamed: 0,Original,Synthetic
Soil_Type1,5.22,16.84
Soil_Type2,12.95,30.9
Soil_Type3,8.3,4.28
Soil_Type4,21.34,37.91
Soil_Type5,2.75,15.72
Soil_Type6,11.32,7.97
Soil_Type7,0.18,0.0
Soil_Type8,0.31,2.9
Soil_Type9,1.97,10.89
Soil_Type10,56.17,54.54


In [11]:
# Per-thousand-row totals for Soil_Type11 through Soil_Type20, original vs. synthetic
counts_per_thousand([f'Soil_Type{i}' for i in range(11,21)])

Unnamed: 0,Original,Synthetic
Soil_Type11,21.36,27.99
Soil_Type12,51.58,18.29
Soil_Type13,30.0,31.3
Soil_Type14,1.03,14.98
Soil_Type15,0.01,0.0
Soil_Type16,4.9,15.89
Soil_Type17,5.89,20.67
Soil_Type18,3.27,13.44
Soil_Type19,6.92,13.81
Soil_Type20,15.94,17.37


In [12]:
# Per-thousand-row totals for Soil_Type21 through Soil_Type30, original vs. synthetic
counts_per_thousand([f'Soil_Type{i}' for i in range(21,31)])

Unnamed: 0,Original,Synthetic
Soil_Type21,1.44,11.54
Soil_Type22,57.44,31.35
Soil_Type23,99.4,49.17
Soil_Type24,36.62,25.02
Soil_Type25,0.82,3.26
Soil_Type26,4.46,13.53
Soil_Type27,1.87,11.77
Soil_Type28,1.63,10.71
Soil_Type29,198.36,22.27
Soil_Type30,51.93,28.87


In [13]:
# Per-thousand-row totals for Soil_Type31 through Soil_Type40, original vs. synthetic
counts_per_thousand([f'Soil_Type{i}' for i in range(31,41)])

Unnamed: 0,Original,Synthetic
Soil_Type31,44.17,27.49
Soil_Type32,90.39,37.46
Soil_Type33,77.72,37.82
Soil_Type34,2.77,12.0
Soil_Type35,3.25,16.05
Soil_Type36,0.2,10.71
Soil_Type37,0.51,12.21
Soil_Type38,26.8,40.75
Soil_Type39,23.76,39.24
Soil_Type40,15.06,31.62
