In [2]:
# Import here
import numpy as np
import pandas as pd
from scipy import stats

In [3]:
def preprocess(filename):
        
    print("Preprocessing %s" % (filename))
    df = pd.read_csv(filename)
    
    missing_keys = ["taxi_type", "months_of_activity", "customer_score", 
                    "customer_score_confidence", "anon_var_1"]

    mean_keys = ["months_of_activity", "customer_score", "anon_var_1"]

    str_keys = {"taxi_type": 5, "customer_score_confidence": 3, "drop_location_type": 14, "sex": 2}
    
    ids = []
    cols = []
    for key, values in df.items():
        
        rows = len(values)
        nan_vals = values.isna()
        print(key, values.dtypes, nan_vals.sum())

        column = []
        for i, val in enumerate(values):
            if nan_vals[i]:
                column.append(None)
            else:
                column.append(val)

        if key == "id":
            ids = column
            continue
        
        column_modified = column
        if nan_vals.sum() > 0:
            assert key in missing_keys, \
                    "Incorrect missing keys list"
            
            if key in mean_keys:
                column_mean = np.mean([v for v in column if v is not None])
                column_modified = [v if v is not None else column_mean for v in column]
            
            else:
                column_mode = stats.mode(np.array([v for v in column if v is not None])).mode[0]
                column_modified = [v if v is not None else column_mode for v in column]
        
        # This is not a string type
        if key not in str_keys.keys():
            cols.append(column_modified)
            continue
        
        # binary columns. Each column filled with 0 initially
        binary_cols = [[0] * rows for _ in range(str_keys[key])]
        
        # Handle multi-character strings first
        # Male -> (1, 0)
        # Female -> (0, 1)
        if key == "sex":
            for i, v in enumerate(column_modified):
                if v == "Male":
                    binary_cols[0][i] = 1
                else:
                    binary_cols[1][i] = 1
                    
        # Handle single character strings now
        else:
            for i, v in enumerate(column_modified):
                binary_cols[ord(v) - ord('A')][i] = 1
        
        # Add binary columns to all columns
        for binary_col in binary_cols:
            cols.append(binary_col)
    
    print()
    return (np.array(ids), np.column_stack(tuple(cols)))

In [4]:
train_ids, training_dataset = preprocess("train.csv")
test_ids, test_data = preprocess("test.csv")

training_data = training_dataset[:, :-1]
training_labels = training_dataset[:, -1]

Preprocessing train.csv
id object 0
distance float64 0
taxi_type object 12152




months_of_activity float64 3533
customer_score float64 12041
customer_score_confidence object 12041
drop_location_type object 0
ratings_given_by_cust float64 0
num_of_cancelled_trips int64 0
anon_var_1 float64 42419
anon_var_2 int64 0
anon_var_3 int64 0
sex object 0
pricing_category int64 0

Preprocessing test.csv
id object 0
distance float64 0
taxi_type object 8058
months_of_activity float64 2387
customer_score float64 8152
customer_score_confidence object 8152
drop_location_type object 0
ratings_given_by_cust float64 0
num_of_cancelled_trips int64 0
anon_var_1 float64 28611
anon_var_2 int64 0
anon_var_3 int64 0
sex object 0



In [36]:
# Add classifiers here
import csv
from sklearn import tree
clf=tree.DecisionTreeClassifier(criterion='entropy')
clf=clf.fit(training_data,training_labels)
prediction = clf.predict(test_data)
final = []
final.append(['id','pricing_category'])
for cnt,x in enumerate(prediction) :
    temp = []
    temp.append(test_ids[cnt])
    temp.append(float(x))
    final.append(temp)
with open('dec_tree.csv', 'w') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerows(final)

In [37]:
import numpy as np
split_size = int(0.8*training_dataset.shape[0])
train = training_dataset[:split_size,:]
test = training_dataset[split_size+1:,:]
trainX = train[:,:-1]
trainY = train[:,-1:]
testX = test[:,:-1]
testY = test[:,-1:]
clf=clf.fit(trainX,trainY)
actual = []
for y in testY:
    actual.append(y[0])
prediction = clf.predict(testX)
actual = np.array(actual)
prediction = np.array(prediction)
acc = 0 
for cnt in range(actual.shape[0]):
    if(actual[cnt]==prediction[cnt]):
        acc = acc+1
print("Accuracy for Decision Tree: ",acc/actual.shape[0])

Accuracy for Decision Tree:  0.5677574530033547


In [38]:
for x in train_ids:
    if x in test_ids:
        print("Found")