# Multiple Linear Regression

## Importing the libraries

In [1]:
# Required libraries
import os

import joblib  # Used to save the model in .pkl format
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


## Importing the dataset

In [13]:
# Location of the 50_Startups CSV. The original absolute path is kept as the
# default so the notebook behaves as before; set the STARTUPS_CSV_PATH
# environment variable to run it on another machine without editing the cell.
dataset_path = os.environ.get(
    'STARTUPS_CSV_PATH',
    'C:/Users/Admin/Downloads/Data Mining (Codes and Datasets)/app/data/Multiple Linear Regression/50_Startups.csv',
)
dataset = pd.read_csv(dataset_path)

dataset.head()


Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [14]:
# One-hot encode the categorical "State" column (when present).
# drop_first=True omits the first category, which is then implied whenever
# all remaining dummy columns are 0.
# The guard keeps the cell safe to re-run: after the first run "State" has
# already been replaced by its dummy columns, and calling get_dummies again
# with columns=['State'] would raise a KeyError.
# dtype=int keeps the dummies as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise emit booleans).
if 'State' in dataset.columns:
    dataset = pd.get_dummies(dataset, columns=['State'], drop_first=True, dtype=int)

In [15]:
# Preview the first five rows of the frame after encoding
dataset.head(n=5)


Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


In [16]:
# Target vector (y): the "Profit" column.
# Feature matrix (X): every remaining column.
y = dataset.loc[:, 'Profit']
X = dataset.drop('Profit', axis=1)


dataset.head()


Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


## Preliminary train/test split (70/30)

In [17]:
# Hold out 30% of the rows as a test set, with a fixed seed so the split is
# repeatable.
# NOTE: these four variables are reassigned further down by an 80/20 split
# (random_state=0); that later split is the one the model is trained on.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


In [19]:
# Show the held-out feature rows. A bare last expression is rendered by the
# notebook as a formatted table, whereas print() only emits plain text.
X_test

    R&D Spend  Administration  Marketing Spend  State_Florida  State_New York
13   91992.39       135495.07        252664.93              0               0
39   38558.51        82982.09        174999.30              0               0
30   61994.48       115641.28         91131.24              1               0
45    1000.23       124153.04          1903.93              0               1
17   94657.16       145077.58        282574.31              0               1
48     542.05        51743.15             0.00              0               1
26   75328.87       144135.98        134050.07              1               0
25   64664.71       139553.16        137962.62              0               0
32   63408.86       129219.61         46085.25              0               0
19   86419.70       153514.11             0.00              0               1
12   93863.75       127320.38        249839.44              1               0
4   142107.34        91391.77        366168.42              1   

## Splitting the dataset into the Training set and Test set

In [20]:
# train_test_split is already imported in the first cell; the import is
# repeated here so this cell can also be run on its own.
from sklearn.model_selection import train_test_split

# Final split used for training: 80% train / 20% test, seeded for
# repeatability. This replaces any X_train / X_test / y_train / y_test
# produced by an earlier cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the Multiple Linear Regression model on the Training set

In [21]:
# Create the linear regression estimator and fit it on the training split
model = LinearRegression()
model.fit(X=X_train, y=y_train)


## Predicting the Test set results

In [24]:
# Print floats with two decimals so the comparison table stays readable
np.set_printoptions(precision=2)
y_pred = model.predict(X_test)

# Side-by-side table: predicted profit (left column) vs. actual profit
# (right column). y_test is a pandas Series, so its underlying NumPy array
# is taken via .values before stacking the two 1-D vectors as columns.
actual_values = y_test.values
print(np.column_stack((y_pred, actual_values)))


[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]


In [26]:
# Save the trained model to a .pkl file.
# The original absolute path is kept as the default; set the MLR_MODEL_PATH
# environment variable to write the model somewhere else without editing
# this cell.
model_path = os.environ.get(
    'MLR_MODEL_PATH',
    'C:/Users/Admin/Downloads/Data Mining (Codes and Datasets)/app/Models/Multiple Linear Regression/multiple_linear_regression_model.pkl',
)

# joblib.dump does not create missing folders, so make sure the target
# directory exists first. A bare file name has no directory part to create.
model_dir = os.path.dirname(model_path)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

joblib.dump(model, model_path)
print(f"Modelo guardado en {model_path}")


Modelo guardado en C:/Users/Admin/Downloads/Data Mining (Codes and Datasets)/app/Models/Multiple Linear Regression/multiple_linear_regression_model.pkl


In [27]:
# Read the persisted model back from disk.
# NOTE(review): joblib files are pickle-based; only load files you trust.
loaded_model = joblib.load(model_path)

# Sanity check: run the reloaded model on the first five test rows
first_test_rows = X_test.iloc[:5]
sample_prediction = loaded_model.predict(first_test_rows)
print("Predicción del modelo cargado para una muestra del conjunto de prueba:", sample_prediction)


Predicción del modelo cargado para una muestra del conjunto de prueba: [103015.2  132582.28 132447.74  71976.1  178537.48]
