In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Read the semicolon-delimited CSV file into a DataFrame
df = pd.read_csv('Base_ScoreCredito_QuantumFinance.csv', delimiter=';')

# Work on a copy so the raw frame `df` stays untouched
df_encoded = df.copy()

# Drop the column `id` up front: it is never used as a feature, so it should not be scaled
df_encoded = df_encoded.drop('id', axis=1)

# Convert `vl_salario_mil` and `SCORE_CREDITO` to numeric,
# replacing any decimal comma with a dot before parsing
for column in ['vl_salario_mil', 'SCORE_CREDITO']:
    df_encoded[column] = pd.to_numeric(
        df_encoded[column].astype(str).str.replace(',', '.')
    )

# Convert `estado_civil` and `escola` to categorical data type
df_encoded['estado_civil'] = df_encoded['estado_civil'].astype('category')
df_encoded['escola'] = df_encoded['escola'].astype('category')

# One-hot encode the `estado_civil` and `escola` columns
df_encoded = pd.get_dummies(df_encoded, columns=['estado_civil', 'escola'])

# Numerical features to standardize (the target is excluded).
# Collected before `sexo` is encoded so that the 0/1 flag is left unscaled.
numeric_columns = [
    column
    for column in df_encoded.select_dtypes(include=['int64', 'float64']).columns
    if column != 'SCORE_CREDITO'
]

# Convert the column `sexo` to numeric.
# `.map` is used instead of `.replace` to avoid pandas' deprecated silent
# downcasting; any value other than 'M'/'F' is reported instead of passing through.
sexo_codes = df_encoded['sexo'].map({'M': 1, 'F': 0})
if sexo_codes.isna().any():
    raise ValueError("Column `sexo` contains values other than 'M' and 'F'")
df_encoded['sexo'] = sexo_codes.astype('int64')

# Split the data into features and target
X = df_encoded.drop('SCORE_CREDITO', axis=1)
y = df_encoded['SCORE_CREDITO']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the numerical features using statistics from the training rows only,
# so that no information about the test rows leaks into the fitted features.
# Explicit copies make the column assignments below act on independent frames.
X_train = X_train.copy()
X_test = X_test.copy()
for column in numeric_columns:
    mu = X_train[column].mean()
    sigma = X_train[column].std()
    X_train[column] = (X_train[column] - mu) / sigma
    X_test[column] = (X_test[column] - mu) / sigma

# Fit the Linear Regression model on the train set
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model on the test set using R-squared and RMSE metrics
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
# RMSE is computed as the square root of the MSE instead of passing
# `squared=False`, which scikit-learn deprecated in 1.4 and scheduled for removal.
rmse = mean_squared_error(y_test, y_pred) ** 0.5

# Print the results
print("Linear Regression:")
print(f"R-squared: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")

# Initialize and train the Decision Tree Regressor model (seeded for reproducibility)
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_tree = tree_model.predict(X_test)

# Evaluate the model with R-squared and RMSE.
# RMSE is computed as the square root of the MSE instead of passing
# `squared=False`, which scikit-learn deprecated in 1.4 and scheduled for removal.
r2_tree = r2_score(y_test, y_pred_tree)
rmse_tree = mean_squared_error(y_test, y_pred_tree) ** 0.5

# Print the results
print("Decision Tree Regressor:")
print(f"R-squared: {r2_tree:.4f}")
print(f"RMSE: {rmse_tree:.4f}")

Linear Regression:
R-squared: 0.6200
RMSE: 81.5961
Decision Tree Regressor:
R-squared: 0.5923
RMSE: 84.5259


  df_encoded['sexo'] = df_encoded['sexo'].replace({'M': 1, 'F': 0})
