# Load the Dataset
Load the 'house_prices.csv' dataset using pandas.

In [None]:
# Importing necessary libraries
import pandas as pd

# Read the house-price records from the CSV file into a DataFrame
csv_path = 'house_prices.csv'
df = pd.read_csv(csv_path)

# Preview the top five records (as the last expression of the cell,
# a notebook renders the returned frame)
df.head(n=5)


# Visualizing Data with Seaborn
Create various graphs to visualize the data and its distributions using seaborn.

In [None]:
# Importing numpy plus the plotting libraries used by every chart below.
# matplotlib.pyplot and seaborn were referenced as `plt` / `sns` without ever
# being imported, which raised NameError on the first plot.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Boxplot for each numeric column to identify outliers
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
for col in numeric_cols:
    plt.figure(figsize=(8, 6))
    sns.boxplot(x=df[col])
    plt.title('Boxplot - {}'.format(col))
    plt.show()

# Scatterplot for SalePrice against each numeric column
for col in numeric_cols:
    # Plotting SalePrice against itself carries no information, so skip it
    # entirely (including the plt.show() call, which previously still ran).
    if col == 'SalePrice':
        continue
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=df[col], y=df['SalePrice'])
    plt.title('SalePrice vs {}'.format(col))
    plt.show()

# Countplot for each categorical (object-dtype) column
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
for col in categorical_cols:
    plt.figure(figsize=(10, 6))
    sns.countplot(x=df[col])
    plt.title('Countplot - {}'.format(col))
    # Category labels can be long; rotate them so they do not overlap
    plt.xticks(rotation=90)
    plt.show()

# Violinplot for SalePrice against each categorical column
for col in categorical_cols:
    plt.figure(figsize=(10, 6))
    sns.violinplot(x=df[col], y=df['SalePrice'])
    plt.title('SalePrice vs {}'.format(col))
    plt.xticks(rotation=90)
    plt.show()

# Select Numeric and Categorical Attributes
Select a numeric and a categorical attribute that have a relationship for further analysis.

In [None]:
# Pick the attribute pair that the modelling cells below work with:
# 'SalePrice' is the numeric attribute and 'OverallQual' is treated as the
# categorical one.
# NOTE(review): the stated rationale for this choice is the correlation matrix
# and the violin plots; no correlation matrix is computed in the cells visible
# here - confirm where it comes from.
categorical_attribute = 'OverallQual'
numeric_attribute = 'SalePrice'

# Report the selection
print("Selected Numeric Attribute: ", numeric_attribute)
print("Selected Categorical Attribute: ", categorical_attribute)

# Boxplot of the numeric attribute grouped by the categorical attribute
group_values = df[categorical_attribute]
target_values = df[numeric_attribute]
chart_title = 'Relationship between {} and {}'.format(categorical_attribute, numeric_attribute)

plt.figure(figsize=(10, 6))
sns.boxplot(x=group_values, y=target_values)
plt.title(chart_title)
plt.xticks(rotation=90)
plt.show()

# Linear Regression Model
Build a Linear Regression model using the selected attributes.

In [None]:
# Importing necessary libraries for Linear Regression.
# If scikit-learn is missing, run `%pip install scikit-learn` in its own
# notebook cell first. That magic had been pasted into the middle of the
# import statement below, which made the whole cell a SyntaxError.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Preparing the data for the Linear Regression model
# Using the selected numeric attribute as the target variable and the selected categorical attribute as the predictor
# NOTE(review): the predictor column is passed to the model as-is, so it is
# assumed to already hold numeric codes - confirm against the dataset.
X = df[[categorical_attribute]]  # double brackets keep X two-dimensional (a DataFrame)
y = df[numeric_attribute]

# Splitting the data into training and testing sets (80% / 20%, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating the Linear Regression model
model = LinearRegression()

# Training the model
model.fit(X_train, y_train)

# Making predictions on the held-out split
y_pred = model.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Printing the evaluation metrics
print("Mean Squared Error: ", mse)
print("R^2 Score: ", r2)

# Decision Tree Regressor
Use a Decision Tree Regressor to explain how the Linear Regression model makes decisions.

In [None]:
# Tree-based regression utilities from scikit-learn
from sklearn.tree import DecisionTreeRegressor, plot_tree

# Build the regressor (fixed seed) and fit it on the training split in one
# step; fit() hands back the estimator itself.
# NOTE: the tree is fitted on the true targets (y_train), not on the linear
# model's predictions.
tree_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predict on the held-out split
y_pred_tree = tree_model.predict(X_test)

# Score the predictions with the same metrics used for the linear model
mse_tree = mean_squared_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)

# Report both metrics
for label, score in (
    ("Mean Squared Error (Decision Tree): ", mse_tree),
    ("R^2 Score (Decision Tree): ", r2_tree),
):
    print(label, score)

# Draw the fitted tree, labelling splits with the single predictor's name
feature_labels = [categorical_attribute]
plt.figure(figsize=(20, 10))
plot_tree(tree_model, feature_names=feature_labels, filled=True)
plt.title('Decision Tree')
plt.show()

# Logistic Regression Model
Build a Logistic Regression model using the selected attributes.

In [None]:
# Importing necessary libraries for Logistic Regression.
# If scikit-learn is missing, run `%pip install scikit-learn` in its own
# notebook cell first (the PyPI package is named `scikit-learn`; `sklearn` is
# only the import name). That magic had been pasted into the middle of the
# import statement below, which made the whole cell a SyntaxError.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Preparing the data for the Logistic Regression model
# Converting the numeric target variable into a binary variable
# For this example, let's consider houses with a price above the median as 'expensive' (1) and the rest as 'not expensive' (0)
y_binary = (y > y.median()).astype(int)

# Splitting the data into training and testing sets.
# X_train / X_test are rebound here; the split uses the same X, test_size and
# random_state as the regression cell.
X_train, X_test, y_train_binary, y_test_binary = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# Creating the Logistic Regression model
logistic_model = LogisticRegression()

# Training the model
logistic_model.fit(X_train, y_train_binary)

# Making predictions on the held-out split
y_pred_binary = logistic_model.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test_binary, y_pred_binary)
conf_matrix = confusion_matrix(y_test_binary, y_pred_binary)

# Printing the evaluation metrics
print("Accuracy: ", accuracy)
print("Confusion Matrix: \n", conf_matrix)

# Decision Tree Classifier
Use a Decision Tree Classifier to explain how the Logistic Regression model makes decisions.

In [None]:
# Importing necessary libraries for Decision Tree Classifier.
# If pydotplus is missing, run `%pip install pydotplus` in its own notebook
# cell first. That magic had been pasted into the middle of the import
# statement, which made the whole cell a SyntaxError.
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from IPython.display import Image
import pydotplus

# Preparing the data for the Decision Tree Classifier model
# Using the binary target variable from the Logistic Regression model
y_train_classifier = y_train_binary
y_test_classifier = y_test_binary

# Creating the Decision Tree Classifier model (fixed seed for reproducibility)
classifier_model = DecisionTreeClassifier(random_state=42)

# Training the model.
# NOTE: the tree is fitted on the true binary labels, not on the logistic
# model's predictions.
classifier_model.fit(X_train, y_train_classifier)

# Making predictions on the held-out split
y_pred_classifier = classifier_model.predict(X_test)

# Evaluating the model
accuracy_classifier = accuracy_score(y_test_classifier, y_pred_classifier)
conf_matrix_classifier = confusion_matrix(y_test_classifier, y_pred_classifier)

# Printing the evaluation metrics
print("Accuracy (Decision Tree Classifier): ", accuracy_classifier)
print("Confusion Matrix (Decision Tree Classifier): \n", conf_matrix_classifier)

# Visualizing the Decision Tree: export it as DOT source (out_file=None
# returns the text instead of writing a file). class_names are listed in the
# order of the encoded labels: 0 = 'not expensive', 1 = 'expensive'.
dot_data = tree.export_graphviz(classifier_model, out_file=None,
                                feature_names=[categorical_attribute],
                                class_names=['not expensive', 'expensive'])

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)

# Show graph (kept as the last expression of the cell so the notebook renders it)
# NOTE(review): create_png() presumably needs the Graphviz executables installed - confirm.
Image(graph.create_png())


La validación cruzada es una técnica utilizada para evaluar la eficacia de un modelo de aprendizaje automático. Existen varios métodos de validación cruzada, cada uno con sus propias ventajas y desventajas:

1. **K-Fold Cross Validation**: En este método, el conjunto de datos se divide en 'k' subconjuntos. Uno de los subconjuntos se utiliza como conjunto de prueba y el resto como conjunto de entrenamiento. El proceso se repite 'k' veces, cada vez con un subconjunto diferente como conjunto de prueba. Este método es útil cuando se dispone de un tamaño de muestra grande.

2. **Stratified K-Fold Cross Validation**: Este método es una variante de K-Fold que puede producir un sesgo y una varianza más bajos en casos en los que el conjunto de datos no está equilibrado. En Stratified K-Fold, se mantiene la proporción de cada clase objetivo en cada pliegue.

3. **Shuffle Split**: Este método genera un número predefinido de conjuntos de entrenamiento y prueba independientes. Los conjuntos de prueba y entrenamiento se generan dividiendo aleatoriamente el conjunto de datos completo. Este método es útil cuando se quiere un control más directo sobre el número de iteraciones y el tamaño de los conjuntos de prueba y entrenamiento.

Cada uno de estos métodos tiene sus propias ventajas y puede ser más útil en ciertos escenarios. La elección del método de validación cruzada depende en gran medida del conjunto de datos y del problema específico que se está tratando de resolver.

En este notebook, hemos explorado diferentes métodos de validación cruzada y cómo pueden impactar en la evaluación de la eficacia de un modelo de aprendizaje automático. Hemos discutido las ventajas y desventajas de K-Fold, Stratified K-Fold y Shuffle Split. Cada método tiene sus propias fortalezas y puede ser más adecuado para ciertos escenarios. La elección del método de validación cruzada debe basarse en el conjunto de datos específico y el problema que se está tratando de resolver. En resumen, la validación cruzada es una herramienta esencial para evaluar la eficacia de un modelo. Seleccionar el método de validación cruzada más adecuado puede mejorar significativamente la fiabilidad de la estimación del rendimiento del modelo y, con ello, la selección del modelo final.