In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [57]:
df = pd.read_csv("clean_data.csv")
df.head()

Unnamed: 0,year,month,stateDescription,sectorName,customers,price,revenue,sales
0,2001,1,Wyoming,all sectors,,4.31,48.1284,1116.17208
1,2001,1,Wyoming,commercial,,5.13,12.67978,247.08691
2,2001,1,Wyoming,industrial,,3.26,19.60858,602.30484
3,2001,1,Wyoming,other,,4.75,0.76868,16.17442
4,2001,1,Wyoming,residential,,6.01,15.07136,250.60591


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85870 entries, 0 to 85869
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   year              85870 non-null  int64  
 1   month             85870 non-null  int64  
 2   stateDescription  85870 non-null  object 
 3   sectorName        85870 non-null  object 
 4   customers         59830 non-null  float64
 5   price             85870 non-null  float64
 6   revenue           85870 non-null  float64
 7   sales             85870 non-null  float64
dtypes: float64(4), int64(2), object(2)
memory usage: 5.2+ MB


In [59]:
df.shape

(85870, 8)

In [60]:
df.describe()

Unnamed: 0,year,month,customers,price,revenue,sales
count,85870.0,85870.0,59830.0,85870.0,85870.0,85870.0
mean,2012.043321,6.480144,2916013.0,9.300193,586.627155,5980.04897
std,6.660304,3.461589,12005670.0,5.010382,2161.047702,21302.453181
min,2001.0,1.0,0.0,0.0,-1e-05,0.0
25%,2006.0,3.0,4998.0,6.65,29.475195,289.144572
50%,2012.0,6.0,299754.0,8.84,121.6415,1447.518085
75%,2018.0,9.0,2028716.0,11.38,421.320628,4339.950965
max,2024.0,12.0,162505000.0,116.67,52361.45097,391900.00897


In [61]:
df.nunique()

year                   24
month                  12
stateDescription       62
sectorName              6
customers           45038
price                2943
revenue             78693
sales               79292
dtype: int64

In [62]:
df.isnull().sum()

year                    0
month                   0
stateDescription        0
sectorName              0
customers           26040
price                   0
revenue                 0
sales                   0
dtype: int64

In [63]:
df.isna().sum() / df.shape[0]

year                0.000000
month               0.000000
stateDescription    0.000000
sectorName          0.000000
customers           0.303249
price               0.000000
revenue             0.000000
sales               0.000000
dtype: float64

In [64]:
df["customers"].fillna(df["customers"].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["customers"].fillna(df["customers"].mean(), inplace=True)


In [65]:
df.head()

Unnamed: 0,year,month,stateDescription,sectorName,customers,price,revenue,sales
0,2001,1,Wyoming,all sectors,2916013.0,4.31,48.1284,1116.17208
1,2001,1,Wyoming,commercial,2916013.0,5.13,12.67978,247.08691
2,2001,1,Wyoming,industrial,2916013.0,3.26,19.60858,602.30484
3,2001,1,Wyoming,other,2916013.0,4.75,0.76868,16.17442
4,2001,1,Wyoming,residential,2916013.0,6.01,15.07136,250.60591


In [66]:
df.isna().sum() / df.shape[0]

year                0.0
month               0.0
stateDescription    0.0
sectorName          0.0
customers           0.0
price               0.0
revenue             0.0
sales               0.0
dtype: float64

In [67]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [68]:
categorical_features = ["stateDescription", "sectorName"]
numeric_features = ["year", "month", "customers", "sales"]

In [69]:
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ], remainder='passthrough')

In [70]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

In [71]:
model = Pipeline([
    ("preprocessor", preprocessor), 
    ("model", LinearRegression())
    ])


In [72]:
X = df.drop(["price", "revenue"], axis=1)
y = df["price"]

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [74]:
model.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [75]:
model.score(X_train, y_train)

0.007135648631432678

In [76]:
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error, r2_score

In [77]:
y_pred = model.predict(X_test)
y_pred

array([9.35761373, 9.10305988, 9.65777164, ..., 9.85109743, 9.34060668,
       9.33076868], shape=(21468,))

In [78]:
y_test

25006     8.66
72982     7.08
40364    11.19
45713     5.37
27305     6.80
         ...  
72250    19.71
42709     6.09
35932    15.19
75073     9.97
20234     8.03
Name: price, Length: 21468, dtype: float64

In [79]:
print("R²:", r2_score(y_test, y_pred))

R²: 0.006796919753468078


In [80]:
models = {
    "lasso": Lasso(),
    "ridge": Ridge(),
    "elastic": ElasticNet()
}

for k, v in models.items():
    k = make_pipeline(preprocessor, v)
    k.fit(X_train, y_train)
    pred = k.predict(X_test)
    print("R²:", r2_score(y_test, pred))
    print("_" * 20)

R²: 0.07516094454817734
____________________
R²: 0.006796919757127151
____________________
R²: 0.09703031883040303
____________________


In [62]:
import joblib
import pickle

In [66]:
pickle.dump(model, open("regmodel.pkl", "wb"))

In [67]:
X_train.head()

Unnamed: 0,year,month,stateDescription,sectorName,customers,sales
5714,2002,7,Idaho,other,2916013.0,31.35882
12737,2004,6,New Hampshire,transportation,2916013.0,0.0
16870,2005,7,Delaware,transportation,2916013.0,0.0
49132,2014,3,Missouri,industrial,10272.0,1426.09446
61668,2017,7,Nevada,industrial,3618.0,1148.51307
