In [1]:
# Import the required packages
import os
from sklearn import preprocessing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
original_df = pd.read_csv(r'C:\Users\luqqa\OneDrive\Escritorio\DiploDatos\Practico AS\train.csv')

In [3]:

def clasificacionDias(dia):
    if dia in ['Monday','Tuesday','Wednesday','Thursday']:
        return 'low day'
    if dia in ['Friday']:
        return 'party'
    else:
        return'weekend'

In [4]:
def transform_data(train_data_fname, test_data_fname):
    df_train = pd.read_csv(train_data_fname)
    df_train['is_train_set'] = 1
    df_test = pd.read_csv(test_data_fname)
    df_test['is_train_set'] = 0

    # we  get the TripType for the train set. To do that, we group by VisitNumber and
    # then we get the max (or min or avg)
    y = df_train.groupby(["VisitNumber", "Weekday"], as_index=False).max().TripType

    # we remove the TripType now, and concat training and testing data
    # the concat is done so that we have the same columns for both datasets
    # after one-hot encoding
    df_train = df_train.drop("TripType", axis=1)
    df = pd.concat([df_train, df_test])
    
    # the next three operations are the ones we have just presented in the previous lines
    #df.drop_duplicates(keep='first', ignore_index=True, inplace=True)

    # drop the columns we won't use (it may be good to use them somehow)
    #df = df.drop(["Upc"], axis=1)
    mask = df.Upc.isna()
    column_name = 'Upc'
    df.loc[mask, column_name] = 0

    #Reemplazamos nulos en FinelineNumber
    mask = (df.FinelineNumber.isna())&(df.DepartmentDescription=='PHARMACY RX')
    column_name = 'FinelineNumber'
    df.loc[mask, column_name] = 4822.0

    # one-hot encoding for the DepartmentDescription
    df = pd.get_dummies(df, columns=["DepartmentDescription"], dummy_na=True)

    # now we add the groupby values
    #df = df.groupby(["VisitNumber", "Weekday","FinelineNumber"], as_index=False).sum()
    df = df.groupby(["VisitNumber", "Weekday"], as_index=False).sum()

    df['Segmentacion Semana']=df.Weekday.apply(lambda x:clasificacionDias(x))
    df = pd.get_dummies(df, columns=["Segmentacion Semana"], dummy_na=True)

    # finally, we do one-hot encoding for the Weekday
    df = pd.get_dummies(df, columns=["Weekday"], dummy_na=True)

    # get train and test back
    df_train = df[df.is_train_set != 0]
    df_test = df[df.is_train_set == 0]
    
    X = df_train.drop(["is_train_set"], axis=1)
    yy = None
    XX = df_test.drop(["is_train_set"], axis=1)

    return X, y, XX, yy

In [5]:
X, y, XX, yy = transform_data(r'C:\Users\luqqa\OneDrive\Escritorio\DiploDatos\Practico AS\train - copia.csv',r'C:\Users\luqqa\OneDrive\Escritorio\DiploDatos\Practico AS\test - copia.csv')

In [6]:
X.shape

(67029, 85)

In [7]:
# split training dataset into train and "validation" 
# (we won't be using validation set in this example, because of the cross-validation;
# but it could be useful for you depending on your approach)
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.30, random_state=42)

In [8]:
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn import ensemble 


In [9]:
param_grid = {
    
    'criterion': ['gini', 'entropy'],
    'random_state'      : [2],
    'max_features': ['auto'],
    "n_estimators": [50, 100, 150]
    
}

In [11]:
from sklearn.model_selection import GridSearchCV



RFT = GridSearchCV(ensemble.RandomForestClassifier(),param_grid, cv=3, scoring='accuracy')
RFT.fit(X_train, y_train);
best_tree_RFT = RFT.best_estimator_
best_tree_RFT

RandomForestClassifier(n_estimators=150, random_state=2)

In [12]:
# results dataframe is used to store the computed results
results = pd.DataFrame(columns=('clf', 'best_acc'))

In [13]:
print('Best Decision Tree accuracy: ', RFT.best_score_)
print(RFT)
results = results.append({'clf': RFT, 'best_acc': RFT.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])

Best Decision Tree accuracy:  0.6881074168797955
GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': ['auto'],
                         'n_estimators': [50, 100, 150], 'random_state': [2]},
             scoring='accuracy')
The best classifier so far is: 
GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': ['auto'],
                         'n_estimators': [50, 100, 150], 'random_state': [2]},
             scoring='accuracy')


In [14]:
yy = RFT.predict(XX)

In [16]:
submission = pd.DataFrame(list(zip(XX.VisitNumber, yy)), columns=["VisitNumber", "TripType"])

In [17]:
submission.to_csv(r"C:\Users\luqqa\OneDrive\Escritorio\DiploDatos\Practico AS\submissionRandomForestClassifier.csv", header=True, index=False)