In [1]:
# Import the required packages
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
original_df = pd.read_csv(r'C:\Users\luqqa\OneDrive\Escritorio\DiploDatos\Practico AS\train.csv')

In [3]:
original_df.columns

Index(['TripType', 'VisitNumber', 'Weekday', 'Upc', 'ScanCount',
       'DepartmentDescription', 'FinelineNumber'],
      dtype='object')

In [4]:
original_df.describe()

Unnamed: 0,TripType,VisitNumber,Upc,ScanCount,FinelineNumber
count,453411.0,453411.0,450559.0,453411.0,450559.0
mean,58.027039,95909.846115,30752430000.0,1.108584,3727.366554
std,155.97337,55399.801147,91639400000.0,0.707029,2779.958546
min,3.0,5.0,834.0,-10.0,0.0
25%,27.0,49390.0,3400004000.0,1.0,1404.0
50%,39.0,96781.0,7056082000.0,1.0,3352.0
75%,40.0,143930.0,30132010000.0,1.0,5501.0
max,999.0,191347.0,978970700000.0,71.0,9998.0


In [5]:
df = original_df.drop(["Upc", "FinelineNumber", "TripType"], axis=1)

In [6]:
df = pd.get_dummies(df, columns=["DepartmentDescription"], dummy_na=True)

In [7]:
df

Unnamed: 0,VisitNumber,Weekday,ScanCount,DepartmentDescription_1-HR PHOTO,DepartmentDescription_ACCESSORIES,DepartmentDescription_AUTOMOTIVE,DepartmentDescription_BAKERY,DepartmentDescription_BATH AND SHOWER,DepartmentDescription_BEAUTY,DepartmentDescription_BEDDING,...,DepartmentDescription_SEASONAL,DepartmentDescription_SERVICE DELI,DepartmentDescription_SHEER HOSIERY,DepartmentDescription_SHOES,DepartmentDescription_SLEEPWEAR/FOUNDATIONS,DepartmentDescription_SPORTING GOODS,DepartmentDescription_SWIMWEAR/OUTERWEAR,DepartmentDescription_TOYS,DepartmentDescription_WIRELESS,DepartmentDescription_nan
0,5,Friday,-1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,Friday,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9,Friday,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,Friday,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,10,Friday,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
453406,191344,Sunday,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
453407,191344,Sunday,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
453408,191344,Sunday,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
453409,191347,Sunday,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
df = df.groupby(["VisitNumber", "Weekday"], as_index=False).sum()

In [9]:
df

Unnamed: 0,VisitNumber,Weekday,ScanCount,DepartmentDescription_1-HR PHOTO,DepartmentDescription_ACCESSORIES,DepartmentDescription_AUTOMOTIVE,DepartmentDescription_BAKERY,DepartmentDescription_BATH AND SHOWER,DepartmentDescription_BEAUTY,DepartmentDescription_BEDDING,...,DepartmentDescription_SEASONAL,DepartmentDescription_SERVICE DELI,DepartmentDescription_SHEER HOSIERY,DepartmentDescription_SHOES,DepartmentDescription_SLEEPWEAR/FOUNDATIONS,DepartmentDescription_SPORTING GOODS,DepartmentDescription_SWIMWEAR/OUTERWEAR,DepartmentDescription_TOYS,DepartmentDescription_WIRELESS,DepartmentDescription_nan
0,5,Friday,-1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,Friday,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,10,Friday,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,11,Friday,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,12,Friday,7,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67024,191329,Sunday,20,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
67025,191337,Sunday,27,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67026,191343,Sunday,9,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67027,191344,Sunday,5,0,0,0,0,0,4,0,...,0,0,0,0,0,0,0,0,1,0


In [10]:
def transform_data(train_data_fname, test_data_fname):
    df_train = pd.read_csv(train_data_fname)
    df_train['is_train_set'] = 1
    df_test = pd.read_csv(test_data_fname)
    df_test['is_train_set'] = 0

    # we  get the TripType for the train set. To do that, we group by VisitNumber and
    # then we get the max (or min or avg)
    y = df_train.groupby(["VisitNumber", "Weekday"], as_index=False).max().TripType

    # we remove the TripType now, and concat training and testing data
    # the concat is done so that we have the same columns for both datasets
    # after one-hot encoding
    df = df_train.drop("TripType", axis=1)
    df = pd.concat([df_train, df_test])
    
    # the next three operations are the ones we have just presented in the previous lines
    
    # drop the columns we won't use (it may be good to use them somehow)
    df = df.drop(["Upc", "FinelineNumber"], axis=1)

    # one-hot encoding for the DepartmentDescription
    df = pd.get_dummies(df, columns=["DepartmentDescription"], dummy_na=True)

    # now we add the groupby values
    df = df.groupby(["VisitNumber", "Weekday"], as_index=False).sum()
    
    # finally, we do one-hot encoding for the Weekday
    df = pd.get_dummies(df, columns=["Weekday"], dummy_na=True)

    # get train and test back
    df_train = df[df.is_train_set != 0]
    df_test = df[df.is_train_set == 0]
    
    X = df_train.drop(["is_train_set"], axis=1)
    yy = None
    XX = df_test.drop(["is_train_set"], axis=1)

    return X, y, XX, yy

In [11]:
X, y, XX, yy = transform_data(r'C:\Users\luqqa\OneDrive\Escritorio\DiploDatos\Practico AS\train.csv',r'C:\Users\luqqa\OneDrive\Escritorio\DiploDatos\Practico AS\test.csv')

In [12]:

# split training dataset into train and "validation" 
# (we won't be using validation set in this example, because of the cross-validation;
# but it could be useful for you depending on your approach)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [13]:
results = pd.DataFrame(columns=('clf', 'best_acc'))

In [14]:

# split training dataset into train and "validation" 
# (we won't be using validation set in this example, because of the cross-validation;
# but it could be useful for you depending on your approach)
#from sklearn.model_selection import train_test_split
#X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.32, random_state=42)

In [50]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Fijamos la semilla aleatoria, para que la división sea siempre repetible
modelo_SGDC = make_pipeline(StandardScaler(), SGDClassifier(random_state=42, loss="log"))

# Entrenamos el modelo
modelo_SGDC.fit(X_train, y_train)

# Obtenemos los valores predichos
y_test_pred = modelo_SGDC.predict(X_test)
y_train_pred = modelo_SGDC.predict(X_train)

SyntaxError: invalid syntax (<ipython-input-47-d97e4fffbc0c>, line 15)

In [24]:
#from sklearn.linear_model import SGDClassifier
#from sklearn.pipeline import make_pipeline
#from sklearn.preprocessing import StandardScaler

# Fijamos la semilla aleatoria, para que la división sea siempre repetible
#modelo_SGDC = make_pipeline(StandardScaler(), SGDClassifier(random_state=42))

# Entrenamos el modelo
#modelo_SGDC.fit(X_train, y_train)

# Obtenemos los valores predichos
#y_test_pred2 = modelo_SGDC.predict(X_test2)
#y_train_pred2 = modelo_SGDC.predict(X_train2)

## Primera variante

In [51]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Evaluación sobre el conjunto de Test
print("TEST REPORT")
print("Accuracy:",accuracy_score(y_test, y_test_pred))

TEST REPORT
Accuracy: 0.6973327305605787


In [52]:
# Evaluación sobre el conjunto de Train
print("TRAIN REPORT")
print("Accuracy:",accuracy_score(y_train, y_train_pred))

TRAIN REPORT
Accuracy: 0.7034670110668241


## Segunda variante

In [19]:
# Evaluación sobre el conjunto de Test
#print("TEST REPORT")
#print("Accuracy:",accuracy_score(y_test2, y_test_pred2))

In [20]:
# Evaluación sobre el conjunto de Train
#print("TRAIN REPORT")
#print("Accuracy:",accuracy_score(y_train2, y_train_pred2))

In [53]:
X.shape, XX.shape

((67029, 80), (28645, 80))

In [54]:
yy = modelo_SGDC.predict(XX)
submission = pd.DataFrame(list(zip(XX.VisitNumber, yy)), columns=["VisitNumber", "TripType"])
submission.to_csv(r"C:\Users\luqqa\OneDrive\Escritorio\DiploDatos\Practico AS\submissionSGDC2.csv", header=True, index=False)
