In [40]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Load the raw data from the CSV file in the working directory
data = pd.read_csv("data.csv")

# Replace missing values with 0. Rows are NOT dropped: every row is kept and
# each missing cell becomes 0. Rebinding instead of inplace=True keeps the
# step chainable and avoids mutating the freshly loaded frame in place.
# NOTE(review): an earlier comment here said rows were dropped; confirm that
# zero-filling (rather than dropna) is the intended treatment.
data = data.fillna(0)

# No categorical encoding is applied at this point; if categorical feature
# columns are added later, pd.get_dummies() is one way to encode them here.

# Check the data after preprocessing
print(data.head())


     date            name  INCOME  EXPENDITURE  Room Hire  Day rate  \
0  23-Sep  EPA 0 C Graham  1501.6       431.54        0.0     300.0   
1  05-Oct  EPA - C Barker  1410.0       449.37        0.0     320.0   
2  21-Oct  EPA - J Siddle  1438.8       471.51        0.0     300.0   
3  25-Oct   EPA - H Shord  1350.0       300.00        0.0     300.0   
4  27-Oct   EPA - S Young  1430.0       517.00        0.0     300.0   

   Moderation  Other/IQA   Shadow/Obsv/Invig/Mod/IQA    Mileage/travel   \
0         0.0        0.0                          0.0            131.54   
1         0.0        0.0                          0.0            124.12   
2         0.0        0.0                          0.0             79.76   
3         0.0        0.0                          0.0              0.00   
4         0.0        0.0                          0.0            114.00   

    B&B/Subs    Total      Net Margin  
0        0.00  431.54  1070.06    71%  
1        5.25  449.37   960.63    68%  
2 

In [41]:
# Echo the frame as loaded; 'Margin' has not been converted yet at this point
data    

Unnamed: 0,date,name,INCOME,EXPENDITURE,Room Hire,Day rate,Moderation,Other/IQA,Shadow/Obsv/Invig/Mod/IQA,Mileage/travel,B&B/Subs,Total,Net,Margin
0,23-Sep,EPA 0 C Graham,1501.6,431.54,0.0,300.0,0.0,0.0,0.0,131.54,0.0,431.54,1070.06,71%
1,05-Oct,EPA - C Barker,1410.0,449.37,0.0,320.0,0.0,0.0,0.0,124.12,5.25,449.37,960.63,68%
2,21-Oct,EPA - J Siddle,1438.8,471.51,0.0,300.0,0.0,0.0,0.0,79.76,91.75,471.51,967.29,67%
3,25-Oct,EPA - H Shord,1350.0,300.0,0.0,300.0,0.0,0.0,0.0,0.0,0.0,300.0,1050.0,78%
4,27-Oct,EPA - S Young,1430.0,517.0,0.0,300.0,0.0,0.0,0.0,114.0,103.0,517.0,913.0,64%
5,03-Nov,EPA - S Stevenson,1760.0,1251.7,0.0,490.0,0.0,0.0,0.0,502.38,259.32,1251.7,508.3,29%
6,08-Nov,EPA - R Harris,1760.0,904.11,0.0,490.0,0.0,0.0,0.0,146.16,267.95,904.11,855.89,49%
7,13-Nov,EPA - A Smith,1620.0,1449.29,0.0,700.0,0.0,0.0,0.0,492.07,257.22,1449.29,170.71,11%
8,10-Nov,EPA - A Thomson,1350.0,354.96,0.0,335.0,0.0,0.0,0.0,19.96,0.0,354.96,995.04,74%
9,27-Nov,EPA - C Guy,1620.0,868.67,0.0,665.0,0.0,0.0,0.0,203.67,0.0,868.67,751.33,46%


In [42]:
# Preprocess 'Margin' column: strip the trailing '%' from the percentage
# strings and rescale to a fraction (e.g. "71%" -> 0.71).
# The dtype guard makes the cell safe to re-run: once the column is numeric
# the .str accessor would raise AttributeError, so the conversion is skipped.
# NOTE(review): a column that is already numeric is assumed to be already
# converted by this cell — confirm if the CSV can ever supply numeric margins.
if not pd.api.types.is_numeric_dtype(data['Margin']):
    data['Margin'] = data['Margin'].str.rstrip('%').astype(float) / 100

In [43]:
# Echo the frame again to check the converted 'Margin' values
data

Unnamed: 0,date,name,INCOME,EXPENDITURE,Room Hire,Day rate,Moderation,Other/IQA,Shadow/Obsv/Invig/Mod/IQA,Mileage/travel,B&B/Subs,Total,Net,Margin
0,23-Sep,EPA 0 C Graham,1501.6,431.54,0.0,300.0,0.0,0.0,0.0,131.54,0.0,431.54,1070.06,0.71
1,05-Oct,EPA - C Barker,1410.0,449.37,0.0,320.0,0.0,0.0,0.0,124.12,5.25,449.37,960.63,0.68
2,21-Oct,EPA - J Siddle,1438.8,471.51,0.0,300.0,0.0,0.0,0.0,79.76,91.75,471.51,967.29,0.67
3,25-Oct,EPA - H Shord,1350.0,300.0,0.0,300.0,0.0,0.0,0.0,0.0,0.0,300.0,1050.0,0.78
4,27-Oct,EPA - S Young,1430.0,517.0,0.0,300.0,0.0,0.0,0.0,114.0,103.0,517.0,913.0,0.64
5,03-Nov,EPA - S Stevenson,1760.0,1251.7,0.0,490.0,0.0,0.0,0.0,502.38,259.32,1251.7,508.3,0.29
6,08-Nov,EPA - R Harris,1760.0,904.11,0.0,490.0,0.0,0.0,0.0,146.16,267.95,904.11,855.89,0.49
7,13-Nov,EPA - A Smith,1620.0,1449.29,0.0,700.0,0.0,0.0,0.0,492.07,257.22,1449.29,170.71,0.11
8,10-Nov,EPA - A Thomson,1350.0,354.96,0.0,335.0,0.0,0.0,0.0,19.96,0.0,354.96,995.04,0.74
9,27-Nov,EPA - C Guy,1620.0,868.67,0.0,665.0,0.0,0.0,0.0,203.67,0.0,868.67,751.33,0.46


In [44]:
# Target: the 'Margin' column
y = data['Margin']

# Features: everything except the identifier columns ('date', 'name') and the
# outcome columns ('Net', 'Margin'), which must not be fed back in as inputs.
# NOTE(review): in the rows displayed in this notebook 'Total' equals
# 'EXPENDITURE' — confirm whether both columns are needed as features.
X = data.drop(['Net', 'Margin', 'date', 'name'], axis=1)


In [45]:
# Echo the feature matrix to confirm which columns remain after the drop
X

Unnamed: 0,INCOME,EXPENDITURE,Room Hire,Day rate,Moderation,Other/IQA,Shadow/Obsv/Invig/Mod/IQA,Mileage/travel,B&B/Subs,Total
0,1501.6,431.54,0.0,300.0,0.0,0.0,0.0,131.54,0.0,431.54
1,1410.0,449.37,0.0,320.0,0.0,0.0,0.0,124.12,5.25,449.37
2,1438.8,471.51,0.0,300.0,0.0,0.0,0.0,79.76,91.75,471.51
3,1350.0,300.0,0.0,300.0,0.0,0.0,0.0,0.0,0.0,300.0
4,1430.0,517.0,0.0,300.0,0.0,0.0,0.0,114.0,103.0,517.0
5,1760.0,1251.7,0.0,490.0,0.0,0.0,0.0,502.38,259.32,1251.7
6,1760.0,904.11,0.0,490.0,0.0,0.0,0.0,146.16,267.95,904.11
7,1620.0,1449.29,0.0,700.0,0.0,0.0,0.0,492.07,257.22,1449.29
8,1350.0,354.96,0.0,335.0,0.0,0.0,0.0,19.96,0.0,354.96
9,1620.0,868.67,0.0,665.0,0.0,0.0,0.0,203.67,0.0,868.67


In [46]:
# Echo the target series ('Margin') that the model will be trained on
y

0     0.71
1     0.68
2     0.67
3     0.78
4     0.64
5     0.29
6     0.49
7     0.11
8     0.74
9     0.46
10    1.00
11    0.74
12    0.74
13    0.72
14    0.66
15    0.63
16    0.59
17    0.43
18    1.00
19    0.47
20    1.00
21    0.41
22    0.74
23    0.49
24    0.56
25    0.60
26    0.55
27    0.69
28    0.77
29    0.73
30    0.51
31    1.00
32    0.60
33    0.11
34    0.39
35    0.17
36    1.00
37   -3.83
Name: Margin, dtype: float64

In [47]:


# Fresh, unfitted linear regression estimator
model = LinearRegression()

# Hold out 20% of the rows for evaluation (80% train / 20% test); the fixed
# random_state keeps the split identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [48]:
# Fit the regressor on the training split only; the test split stays unseen
model.fit(X=X_train, y=y_train)


In [49]:

# Predict 'Margin' for the held-out test rows
y_pred = model.predict(X=X_test)

# Bare last expression so the notebook echoes the prediction array
y_pred

array([ 1.25844041,  1.84532527,  0.83916437,  0.43500487,  0.35849057,
       -0.03503184,  0.39831636,  0.17062235])

In [50]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error between the held-out targets and the predictions,
# reported in the same units as 'Margin'
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 0.4781909665560481


In [51]:
import joblib

# Persist the fitted estimator to disk; joblib.dump returns the list of
# written file names, which the notebook echoes as the cell output
joblib.dump(value=model, filename="predictive_model.pkl")


['predictive_model.pkl']