# chi-square

In [1]:
import pandas as pd
from scipy.stats import chi2_contingency

file_path = 'CleanedDataCoSupplyChainDataset.csv'
dataog = pd.read_csv(file_path)
print(dataog.head())

       type  days_for_shipping_(real)  days_for_shipment_(scheduled)  late?  \
0     DEBIT                         3                              4     -1   
1  TRANSFER                         5                              4      1   
2      CASH                         4                              4      0   
3     DEBIT                         3                              4     -1   
4   PAYMENT                         2                              4     -2   

    late  benefit_per_order  sales_per_customer   delivery_status  \
0  False          91.250000          314.640015  Advance shipping   
1   True         -79.700005          311.359985     Late delivery   
2  False         -79.700005          309.720001  Shipping on time   
3  False          22.860001          304.809998  Advance shipping   
4  False         134.210007          298.250000  Advance shipping   

   late_delivery_risk  category_id  ...      order_state     order_status  \
0                   0           7

In [38]:
# Filter data to include only rows where 'order_region' contains 'America'
datanew = dataog[dataog['order_region'].str.contains('America|USA', na=False)]

print(datanew[['customer_state', 'order_country']].head()) 
#customer state is where the order is coming from = store location
#order country is where the order is going to

   customer_state   order_country
55             PR  Estados Unidos
56             PR  Estados Unidos
57             PR  Estados Unidos
74             PR       Guatemala
75             PR     El Salvador


In [41]:
datanew.to_csv('datanew2.csv', index=False)

In [42]:
print(datanew.head())

       type  days_for_shipping_(real)  days_for_shipment_(scheduled)  late?  \
55  PAYMENT                         2                              2      0   
56  PAYMENT                         5                              2      3   
57  PAYMENT                         6                              2      4   
74  PAYMENT                         6                              2      4   
75  PAYMENT                         5                              2      3   

     late  benefit_per_order  sales_per_customer   delivery_status  \
55  False          22.410000           74.680000  Shipping on time   
56   True          25.240000           90.150002     Late delivery   
57   True          30.570000          117.580002     Late delivery   
74   True         101.389999          299.970001     Late delivery   
75   True         -79.700005          178.169998     Late delivery   

    late_delivery_risk  category_id  ...   order_state     order_status  \
55                   0       

In [44]:
unique_customer_states = datanew['customer_state'].unique()
print(unique_customer_states)

['PR' 'MD' 'CA' 'NY' 'OH' 'DC' 'FL' 'HI' 'TX' 'MA' 'PA' 'TN' 'NJ' 'WV'
 'IL' 'GA' 'MN' 'OR' 'MI' 'IN' 'NC' 'MO' 'VA' 'CT' 'LA' 'AZ' 'NV' 'KY'
 'CO' 'WI' 'UT' 'SC' 'ND' 'AR' 'RI' 'NM' 'OK' 'ID' 'WA' 'KS' 'DE' 'MT'
 'IA' 'AL']


In [45]:
datanew['month'] = pd.to_datetime(datanew['order_date_(dateorders)']).dt.month

variables = ['type', 'category_name', 'order_item_discount', 'shipping_mode', 'customer_country', 
            'customer_segment', 'order_country', 'order_region','market', 'product_name','late_delivery_risk','order_item_quantity', 'department_name', 'customer_state', 'order_item_discount_rate','month']

results = []
for var in variables:
    try:
        contingency_table = pd.crosstab(datanew[var], datanew['late'])
        chi2, p, dof, expected = chi2_contingency(contingency_table)
        
        results.append({
            'Variable': var,
            'Chi-Square Statistic': chi2,
            'P-Value': p,
            'Reject Null Hypothesis': 'Yes' if p < 0.05 else 'No'
        })
    except ValueError as e:
        results.append({
            'Variable': var,
            'Chi-Square Statistic': 'Error',
            'P-Value': 'Error',
            'Reject Null Hypothesis': f"Error: {e}"
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Sort by Chi-Square Statistic to identify the most impactful variables
results_df = results_df.sort_values(by='Chi-Square Statistic', ascending=False)

print("Chi-Squared Test Results:")
print("The variables below are tested for their association with Late_delivery_risk:")
results_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  datanew['month'] = pd.to_datetime(datanew['order_date_(dateorders)']).dt.month


Chi-Squared Test Results:
The variables below are tested for their association with Late_delivery_risk:


Unnamed: 0,Variable,Chi-Square Statistic,P-Value,Reject Null Hypothesis
10,late_delivery_risk,56028.175608,0.0,Yes
3,shipping_mode,14009.529154,0.0,Yes
2,order_item_discount,591.284256,0.6365444,No
6,order_country,109.495841,5.811568e-14,Yes
9,product_name,88.464976,0.4063566,No
13,customer_state,85.459426,0.0001248748,Yes
1,category_name,28.996005,0.5178066,No
14,order_item_discount_rate,26.868588,0.06002342,No
15,month,10.960957,0.1403311,No
0,type,9.143968,0.02743675,Yes


# linear regression

In [73]:
import pandas as pd
from scipy.stats import chi2_contingency

file_path = 'CleanedDataCoSupplyChainDataset.csv'
dataog = pd.read_csv(file_path)
datanew1 = dataog[dataog['order_region'].str.contains('America|USA', na=False)]
datanew1['month'] = pd.to_datetime(datanew1['order_date_(dateorders)']).dt.month
print(datanew1.head())

       type  days_for_shipping_(real)  days_for_shipment_(scheduled)  late?  \
55  PAYMENT                         2                              2      0   
56  PAYMENT                         5                              2      3   
57  PAYMENT                         6                              2      4   
74  PAYMENT                         6                              2      4   
75  PAYMENT                         5                              2      3   

     late  benefit_per_order  sales_per_customer   delivery_status  \
55  False          22.410000           74.680000  Shipping on time   
56   True          25.240000           90.150002     Late delivery   
57   True          30.570000          117.580002     Late delivery   
74   True         101.389999          299.970001     Late delivery   
75   True         -79.700005          178.169998     Late delivery   

    late_delivery_risk  category_id  ...     order_status order_zipcode  \
55                   0       

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  datanew1['month'] = pd.to_datetime(datanew1['order_date_(dateorders)']).dt.month


In [74]:
datanew1.columns.tolist()

['type',
 'days_for_shipping_(real)',
 'days_for_shipment_(scheduled)',
 'late?',
 'late',
 'benefit_per_order',
 'sales_per_customer',
 'delivery_status',
 'late_delivery_risk',
 'category_id',
 'category_name',
 'customer_city',
 'customer_country',
 'customer_email',
 'customer_fname',
 'customer_id',
 'customer_lname',
 'customer_password',
 'customer_segment',
 'customer_state',
 'customer_street',
 'customer_zipcode',
 'department_id',
 'department_name',
 'latitude',
 'longitude',
 'market',
 'order_city',
 'order_country',
 'order_customer_id',
 'order_date_(dateorders)',
 'order_id',
 'order_item_cardprod_id',
 'order_item_discount',
 'order_item_discount_rate',
 'order_item_id',
 'order_item_product_price',
 'order_item_profit_ratio',
 'order_item_quantity',
 'sales',
 'order_item_total',
 'order_profit_per_order',
 'order_region',
 'order_state',
 'order_status',
 'order_zipcode',
 'product_card_id',
 'product_category_id',
 'product_image',
 'product_name',
 'product_price'

In [82]:
# Defining the dependent and independent variables
dependent_var = 'days_for_shipping_(real)'
categorical_vars = ['type', 'category_name', 'shipping_mode', 'customer_country', 'customer_segment','order_country', 'order_region', 'market', 'product_name', 'department_name', 'customer_state']

# Filter only the required columns from the dataset
data_filtered = datanew1[[dependent_var] + categorical_vars + ['late_delivery_risk'] + ['order_item_discount'] + ['order_item_quantity']+['month']].copy()
print(data_filtered.head())

    days_for_shipping_(real)     type     category_name shipping_mode  \
55                         2  PAYMENT       Electronics  Second Class   
56                         5  PAYMENT      Boxing & MMA  Second Class   
57                         6  PAYMENT            Cleats  Second Class   
74                         6  PAYMENT  Cardio Equipment  Second Class   
75                         5  PAYMENT            Cleats  Second Class   

   customer_country customer_segment   order_country     order_region market  \
55      Puerto Rico      Home Office  Estados Unidos     West of USA    USCA   
56      Puerto Rico      Home Office  Estados Unidos     West of USA    USCA   
57      Puerto Rico      Home Office  Estados Unidos     West of USA    USCA   
74      Puerto Rico      Home Office       Guatemala  Central America  LATAM   
75      Puerto Rico      Home Office     El Salvador  Central America  LATAM   

                                     product_name department_name  \
55     Unde

In [84]:
ndata = pd.get_dummies(data_filtered, columns=categorical_vars, drop_first=True)
print(ndata.head())


    days_for_shipping_(real)  late_delivery_risk  order_item_discount  \
55                         2                   0            15.300000   
56                         5                   1            19.790001   
57                         6                   1             2.400000   
74                         6                   1             0.000000   
75                         5                   1             1.800000   

    order_item_quantity  month  type_DEBIT  type_PAYMENT  type_TRANSFER  \
55                    2      6       False          True          False   
56                    2      8       False          True          False   
57                    2      4       False          True          False   
74                    3      3       False          True          False   
75                    3      4       False          True          False   

    category_name_As Seen on  TV!  category_name_Baseball & Softball  ...  \
55                          False

In [86]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np
from scipy import stats


X = ndata.drop(columns=['days_for_shipping_(real)'])
y = ndata['days_for_shipping_(real)']

# Fill missing values with the mean
y = y.fillna(y.mean())

# Fit a linear regression model (no train-test split required)
model = LinearRegression()
model.fit(X, y)
coefficients = model.coef_

# Calculate residuals and standard errors
y_pred = model.predict(X)
residuals = y - y_pred
standard_errors = np.std(residuals) / np.sqrt(np.sum((X - X.mean()) ** 2, axis=0))

# Calculate t-values and p-values
t_values = coefficients / standard_errors
p_values = [2 * (1 - stats.t.cdf(np.abs(t), df=len(X) - 2)) for t in t_values]

# Create a results dataframe
results = pd.DataFrame({
    'Variable': X.columns,
    'Coefficient': coefficients,
    'P-Value': p_values,
    'Significant (P < 0.05)': ['Yes' if p < 0.05 else 'No' for p in p_values]
})

# Add intercept information
intercept_row = pd.DataFrame({
    'Variable': ['Intercept'],
    'Coefficient': [model.intercept_],
    'P-Value': [np.nan],
    'Significant (P < 0.05)': [np.nan]
})

results = pd.concat([intercept_row, results], ignore_index=True)

results


Unnamed: 0,Variable,Coefficient,P-Value,Significant (P < 0.05)
0,Intercept,-8.313389e+09,,
1,late_delivery_risk,2.225078e+00,0.000000e+00,Yes
2,order_item_discount,8.956632e-05,6.172752e-01,No
3,order_item_quantity,4.912161e-04,8.227187e-01,No
4,month,2.656617e-03,1.186851e-01,No
...,...,...,...,...
199,customer_state_UT,-3.356595e-01,0.000000e+00,Yes
200,customer_state_VA,-2.332482e-01,5.258016e-13,Yes
201,customer_state_WA,-3.850181e-01,0.000000e+00,Yes
202,customer_state_WI,-3.293873e-01,7.704948e-14,Yes


In [92]:
significant_vars = results[results['Significant (P < 0.05)'] == 'Yes']
print("Significant Variables:")
significant_vars

Significant Variables:


Unnamed: 0,Variable,Coefficient,P-Value,Significant (P < 0.05)
1,late_delivery_risk,2.225078e+00,0.000000e+00,Yes
5,type_DEBIT,-2.562836e-02,1.087683e-04,Yes
6,type_PAYMENT,-4.380391e-02,1.319873e-08,Yes
7,type_TRANSFER,1.740229e-01,0.000000e+00,Yes
8,category_name_As Seen on TV!,-4.488173e+09,0.000000e+00,Yes
...,...,...,...,...
199,customer_state_UT,-3.356595e-01,0.000000e+00,Yes
200,customer_state_VA,-2.332482e-01,5.258016e-13,Yes
201,customer_state_WA,-3.850181e-01,0.000000e+00,Yes
202,customer_state_WI,-3.293873e-01,7.704948e-14,Yes


Logistic regression

In [3]:
import pandas as pd
import numpy as np

fp = pd.read_csv('datanew2.csv')

unique_order_countries = fp['order_country'].unique()
unique_order_countries_list = sorted(unique_order_countries.tolist())
print(unique_order_countries_list)



['Argentina', 'Belice', 'Bolivia', 'Brasil', 'Chile', 'Colombia', 'Costa Rica', 'Ecuador', 'El Salvador', 'Estados Unidos', 'Guatemala', 'Guayana Francesa', 'Guyana', 'Honduras', 'México', 'Nicaragua', 'Panamá', 'Paraguay', 'Perú', 'Surinam', 'Uruguay', 'Venezuela']


In [4]:
unique_order_countries_by_region = fp.groupby('order_region')['order_country'].unique().apply(list).to_dict()
for region in unique_order_countries_by_region:
    unique_order_countries_by_region[region] = sorted(unique_order_countries_by_region[region])
print(unique_order_countries_by_region)

{'Central America': ['Belice', 'Costa Rica', 'El Salvador', 'Guatemala', 'Honduras', 'México', 'Nicaragua', 'Panamá'], 'East of USA': ['Estados Unidos'], 'South America': ['Argentina', 'Bolivia', 'Brasil', 'Chile', 'Colombia', 'Ecuador', 'Guayana Francesa', 'Guyana', 'Paraguay', 'Perú', 'Surinam', 'Uruguay', 'Venezuela'], 'South of  USA ': ['Estados Unidos'], 'West of USA ': ['Estados Unidos']}


In [5]:
unique_order_cities_by_region = fp.groupby('order_region')['order_city'].unique().apply(list).to_dict()
for region in unique_order_cities_by_region:
    unique_order_cities_by_region[region] = sorted(unique_order_cities_by_region[region])
print(unique_order_cities_by_region)

{'Central America': ['Acayucan', 'Acuña', 'Acámbaro', 'Altotonga', 'Amatitlán', 'Antiguo Cuscatlán', 'Apatzingán de la Constitución', 'Apodaca', 'Apopa', 'Arraiján', 'Atlixco', 'Azcapotzalco', 'Cadereyta', 'Campeche', 'Cancún', 'Celaya', 'Chetumal', 'Chihuahua', 'Chilpancingo', 'Chimaltenango', 'Chinandega', 'Chinautla', 'Choloma', 'Cholula', 'Choluteca', 'Ciudad del Carmen', 'Coacalco', 'Coatzacoalcos', 'Colima', 'Colón', 'Coyoacán', 'Cuajimalpa', 'Cuautitlán', 'Cuernavaca', 'Culiacán', 'Cuscatancingo', 'Córdoba', 'David', 'Delgado', 'Delicias', 'Durango', 'El Progreso', 'Ensenada', 'Escuintla', 'Estelí', 'Fresnillo de González Echeverría', 'Frontera', 'Garza García', 'General Escobedo', 'Granada', 'Guadalajara', 'Guamúchil', 'Guanajuato', 'Guasave', 'Guatemala City', 'Guaymas', 'Guzmán', 'Gómez Palacio', 'Hermosillo', 'Heroica Zitácuaro', 'Hidalgo', 'Huehuetenango', 'Huixquilucan', 'Iguala', 'Ilopango', 'Irapuato', 'Ixtapaluca', 'Jiutepec', 'Juárez', 'La Ceiba', 'La Chorrera', 'La Pa

In [6]:
unique_order_cities_by_country = fp.groupby('order_country')['order_city'].unique().apply(list).to_dict()
for country in unique_order_cities_by_country:
    unique_order_cities_by_country[country] = sorted(unique_order_cities_by_country[country])
print(unique_order_cities_by_country)

{'Argentina': ['Avellaneda', 'Bahía Blanca', 'Buenos Aires', 'Cipolletti', 'Comodoro Rivadavia', 'Concepción del Uruguay', 'Corrientes', 'Córdoba', 'Esquina', 'Junín', 'La Plata', 'La Rioja', 'Mar del Plata', 'Mendoza', 'Neuquén', 'Paraná', 'Posadas', 'Presidencia Roque Sáenz Peña', 'Quilmes', 'Resistencia', 'Rosario', 'Salta', 'San Fernando del Valle de Catamarca', 'San Juan', 'San Justo', 'San Luis', 'San Nicolás de los Arroyos', 'San Rafael', 'San Salvador de Jujuy', 'Santa Fe', 'Santa Rosa', 'Santiago del Estero', 'Tartagal'], 'Belice': ['San Ignacio'], 'Bolivia': ['Cochabamba', 'La Paz', 'Oruro', 'Potosí', 'Riberalta', 'Santa Cruz de la Sierra', 'Sucre', 'Trinidad', 'Yacuiba'], 'Brasil': ['Abreu e Lima', 'Altamira', 'Americana', 'Ananindeua', 'Andradina', 'Anápolis', 'Apucarana', 'Aracaju', 'Aracati', 'Araguaína', 'Arapiraca', 'Arapongas', 'Araranguá', 'Araraquara', 'Araucária', 'Araçatuba', 'Arcoverde', 'Ariquemes', 'Açu', 'Bagé', 'Balneário Camboriú', 'Barbacena', 'Barra Mansa',

In [None]:
unique_cities = fp['order_city'].unique()
unique_cities_list = sorted(unique_cities.tolist())
print(unique_cities_list)