In [262]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Show every column when a frame is displayed. The full option name is used
# because a partial key such as 'max_column' can match more than one pandas
# option (e.g. display.max_columns and styler.render.max_columns) and raise
# an OptionError.
pd.set_option('display.max_columns', None)
# Load the raw pumpkin data; the path is relative to the notebook's directory.
pumpkins = pd.read_csv(filepath_or_buffer='../data/US-pumpkins.csv')

In [263]:
# Size of the freshly loaded frame as (rows, columns).
pumpkins.shape

(1757, 26)

In [264]:
# Print the column labels as a plain array.
print(pumpkins.columns.to_numpy())

['City Name' 'Type' 'Package' 'Variety' 'Sub Variety' 'Grade' 'Date'
 'Low Price' 'High Price' 'Mostly Low' 'Mostly High' 'Origin'
 'Origin District' 'Item Size' 'Color' 'Environment' 'Unit of Sale'
 'Quality' 'Condition' 'Appearance' 'Storage' 'Crop' 'Repack' 'Trans Mode'
 'Unnamed: 24' 'Unnamed: 25']


In [265]:
# Preview the first five rows (head() defaults to n=5).
pumpkins.head()

Unnamed: 0,City Name,Type,Package,Variety,Sub Variety,Grade,Date,Low Price,High Price,Mostly Low,Mostly High,Origin,Origin District,Item Size,Color,Environment,Unit of Sale,Quality,Condition,Appearance,Storage,Crop,Repack,Trans Mode,Unnamed: 24,Unnamed: 25
0,BALTIMORE,,24 inch bins,,,,4/29/17,270.0,280.0,270.0,280.0,MARYLAND,,lge,,,,,,,,,E,,,
1,BALTIMORE,,24 inch bins,,,,5/6/17,270.0,280.0,270.0,280.0,MARYLAND,,lge,,,,,,,,,E,,,
2,BALTIMORE,,24 inch bins,HOWDEN TYPE,,,9/24/16,160.0,160.0,160.0,160.0,DELAWARE,,med,ORANGE,,,,,,,,N,,,
3,BALTIMORE,,24 inch bins,HOWDEN TYPE,,,9/24/16,160.0,160.0,160.0,160.0,VIRGINIA,,med,ORANGE,,,,,,,,N,,,
4,BALTIMORE,,24 inch bins,HOWDEN TYPE,,,11/5/16,90.0,100.0,90.0,100.0,MARYLAND,,lge,ORANGE,,,,,,,,N,,,


In [266]:
# Count the missing values in each column of the raw frame.
pumpkins.isnull().sum()

City Name             0
Type               1712
Package               0
Variety               5
Sub Variety        1461
Grade              1757
Date                  0
Low Price             0
High Price            0
Mostly Low          103
Mostly High         103
Origin                3
Origin District    1626
Item Size           279
Color               616
Environment        1757
Unit of Sale       1595
Quality            1757
Condition          1757
Appearance         1757
Storage            1757
Crop               1757
Repack                0
Trans Mode         1757
Unnamed: 24        1757
Unnamed: 25        1654
dtype: int64

In [267]:
# Keep only the rows that have a Color value, then drop every column that is
# completely empty in the remaining rows. The steps are chained so each one
# returns a new frame; calling dropna(inplace=True) on a boolean-filtered
# slice is discouraged and can trigger SettingWithCopyWarning.
pumpkins = (
    pumpkins
    .dropna(subset=['Color'])
    .dropna(how='all', axis=1)
)
pumpkins.head(5)

Unnamed: 0,City Name,Type,Package,Variety,Sub Variety,Date,Low Price,High Price,Mostly Low,Mostly High,Origin,Origin District,Item Size,Color,Unit of Sale,Repack,Unnamed: 25
2,BALTIMORE,,24 inch bins,HOWDEN TYPE,,9/24/16,160.0,160.0,160.0,160.0,DELAWARE,,med,ORANGE,,N,
3,BALTIMORE,,24 inch bins,HOWDEN TYPE,,9/24/16,160.0,160.0,160.0,160.0,VIRGINIA,,med,ORANGE,,N,
4,BALTIMORE,,24 inch bins,HOWDEN TYPE,,11/5/16,90.0,100.0,90.0,100.0,MARYLAND,,lge,ORANGE,,N,
5,BALTIMORE,,24 inch bins,HOWDEN TYPE,,11/12/16,90.0,100.0,90.0,100.0,MARYLAND,,lge,ORANGE,,N,
6,BALTIMORE,,36 inch bins,HOWDEN TYPE,,9/24/16,160.0,170.0,160.0,170.0,MARYLAND,,med,ORANGE,,N,


In [268]:
# Give the auto-named 'Unnamed: 25' column a placeholder label.
# NOTE(review): the meaning of this column is not evident from the data shown;
# replace 'some column' with a descriptive name once it is known.
# Rebinding the result avoids mutating the frame in place.
pumpkins = pumpkins.rename(columns={'Unnamed: 25': 'some column'})

In [269]:
# Size as (rows, columns) after removing rows without a Color and the
# columns that were entirely empty.
pumpkins.shape

(1141, 17)

In [270]:
# Count the missing values that remain in each column after cleaning.
pumpkins.isnull().sum()

City Name             0
Type               1140
Package               0
Variety               0
Sub Variety         909
Date                  0
Low Price             0
High Price            0
Mostly Low           89
Mostly High          89
Origin                0
Origin District    1035
Item Size           150
Color                 0
Unit of Sale       1017
Repack                0
some column        1052
dtype: int64

In [271]:
# List the distinct values found in each column.
for column_name, column_values in pumpkins.items():
    print(column_name, ": ", column_values.unique())

City Name :  ['BALTIMORE' 'ATLANTA' 'BOSTON' 'CHICAGO' 'COLUMBIA' 'LOS ANGELES'
 'NEW YORK' 'DETROIT' 'DALLAS' 'MIAMI' 'SAN FRANCISCO' 'PHILADELPHIA'
 'ST. LOUIS']
Type :  [nan 'Organic']
Package :  ['24 inch bins' '36 inch bins' '1 1/9 bushel cartons' '1/2 bushel cartons'
 '1 1/9 bushel crates' 'bushel cartons' 'bins' '35 lb cartons'
 '50 lb sacks' '40 lb cartons' 'bushel baskets' '22 lb cartons']
Variety :  ['HOWDEN TYPE' 'PIE TYPE' 'BIG MACK TYPE' 'MINIATURE' 'CINDERELLA'
 'FAIRYTALE' 'BLUE TYPE' 'KNUCKLE HEAD' 'HOWDEN WHITE TYPE']
Sub Variety :  [nan 'FLAT TYPE' 'ROUND TYPE']
Date :  ['9/24/16' '11/5/16' '11/12/16' '10/1/16' '10/8/16' '10/15/16' '10/22/16'
 '10/29/16' '9/16/17' '9/23/17' '9/30/17' '11/19/16' '11/26/16' '12/3/16'
 '12/10/16' '9/2/17' '9/9/17' '11/29/14' '9/26/16' '9/27/16' '9/28/16'
 '9/29/16' '9/30/16']
Low Price :  [160.    90.   150.   140.   100.   130.   120.    15.    18.    17.
  16.   200.   190.    50.    40.   145.   180.   170.   195.   175.
 260.   135. 

In [272]:
# Features are every column except the target; the target is Color.
X = pumpkins.drop(columns='Color')
y = pumpkins['Color']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [273]:
# Preview the first five training rows.
X_train.head(5)

Unnamed: 0,City Name,Type,Package,Variety,Sub Variety,Date,Low Price,High Price,Mostly Low,Mostly High,Origin,Origin District,Item Size,Unit of Sale,Repack,some column
406,BOSTON,,24 inch bins,PIE TYPE,,10/8/16,200.0,200.0,200.0,200.0,MICHIGAN,,sml,,N,
390,BOSTON,,1 1/9 bushel cartons,PIE TYPE,,10/8/16,15.0,16.0,15.0,15.0,MASSACHUSETTS,,sml,,N,
1462,SAN FRANCISCO,,36 inch bins,HOWDEN TYPE,,9/23/17,120.0,120.0,120.0,120.0,CALIFORNIA,,med,,N,
1036,COLUMBIA,,1/2 bushel cartons,MINIATURE,FLAT TYPE,10/29/16,18.0,18.0,18.0,18.0,PENNSYLVANIA,,sml,,N,
608,CHICAGO,,36 inch bins,HOWDEN WHITE TYPE,,10/15/16,150.0,150.0,150.0,150.0,ILLINOIS,,lge,,N,


In [274]:
# All feature columns except the four price columns ('Low Price', 'High Price',
# 'Mostly Low', 'Mostly High') are encoded as categories. No transformer is
# registered for the price columns, so ColumnTransformer's default
# remainder='drop' leaves them out of the model.
categorical_features = [
    'City Name', 'Type', 'Package', 'Variety', 'Sub Variety', 'Date',
    'Origin', 'Origin District', 'Item Size', 'Unit of Sale', 'Repack',
    'some column',
]

# Replace missing categories with a constant placeholder, then one-hot encode.
# handle_unknown='ignore' makes categories that were not seen during fit
# encode as all zeros instead of raising at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="constant")),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)])

# Color has more than two classes and liblinear only solves binary problems.
# Wrapping the estimator makes the one-vs-rest scheme explicit rather than
# relying on LogisticRegression's implicit multiclass handling for liblinear,
# which recent scikit-learn releases deprecate.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('logregressor', OneVsRestClassifier(
                               LogisticRegression(solver="liblinear")))])

pipeline.fit(X_train, y_train)


ValueError: y should be a 1d array, got an array of shape (912, 4) instead.

In [None]:
# Predict a Color label for every held-out row.
predictions = pipeline.predict(X=X_test)

In [None]:
# STRIPED is never predicted on the test set, so its precision is undefined.
# zero_division=0 reports it as 0 (the value already shown) without emitting
# an UndefinedMetricWarning for each averaged metric.
print(classification_report(y_test, predictions, zero_division=0))
print('Predicted labels: ', predictions)
print('Accuracy: ', accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

      ORANGE       0.88      0.97      0.92       180
     STRIPED       0.00      0.00      0.00         3
       WHITE       0.81      0.54      0.65        46

    accuracy                           0.87       229
   macro avg       0.56      0.50      0.52       229
weighted avg       0.85      0.87      0.85       229

Predicted labels:  ['ORANGE' 'ORANGE' 'ORANGE' 'ORANGE' 'ORANGE' 'WHITE' 'ORANGE' 'ORANGE'
 'ORANGE' 'WHITE' 'WHITE' 'ORANGE' 'ORANGE' 'ORANGE' 'ORANGE' 'ORANGE'
 'ORANGE' 'ORANGE' 'ORANGE' 'ORANGE' 'ORANGE' 'ORANGE' 'ORANGE' 'ORANGE'
 'ORANGE' 'ORANGE' 'ORANGE' 'ORANGE' 'ORANGE' 'ORANGE' 'ORANGE' 'ORANGE'
 'ORANGE' 'ORANGE' 'ORANGE' 'WHITE' 'ORANGE' 'WHITE' 'WHITE' 'ORANGE'
 'ORANGE' 'ORANGE' 'ORANGE' 'WHITE' 'ORANGE' 'ORANGE' 'ORANGE' 'ORANGE'
 'ORANGE' 'WHITE' 'ORANGE' 'ORANGE' 'ORANGE' 'ORANGE' 'ORANGE' 'ORANGE'
 'ORANGE' 'ORANGE' 'ORANGE' 'ORANGE' 'WHITE' 'ORANGE' 'ORANGE' 'ORANGE'
 'ORANGE' 'ORANGE' 'ORANG

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels and columns the predicted labels, both in sorted
# label order.
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[174,   0,   6],
       [  3,   0,   0],
       [ 21,   0,  25]])