In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# --- Data loading and preprocessing ------------------------------------------
# `url` holds a local filesystem path (not a web address).
url = "/content/sample_data/sample_submission.csv"

# Read the CSV, discard every row that contains a missing value, then one-hot
# encode any categorical columns (the first level of each is dropped).
data = pd.get_dummies(pd.read_csv(url).dropna(), drop_first=True)

# --- Features / target -------------------------------------------------------
# Everything except 'SalePrice' is used as a predictor.
# NOTE(review): if the file only carries an identifier next to 'SalePrice',
# the regression below has no real predictors -- confirm the dataset.
y = data.loc[:, 'SalePrice']
X = data.drop(columns='SalePrice')

# --- Hold-out split ----------------------------------------------------------
# 20% of the rows are reserved for testing; the seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# --- Fit and predict ---------------------------------------------------------
# `fit` returns the estimator itself, so construction and training are chained.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# --- Evaluation on the held-out rows -----------------------------------------
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


Mean Squared Error: 239521036.51357394
R^2 Score: 0.008827921820605544


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# --- Data loading ------------------------------------------------------------
url = "/content/sample_data/sample_submission.csv"
data = pd.read_csv(url)

# --- Features / target -------------------------------------------------------
# The file offers no predictor other than 'Id', so the model trained below is
# not meaningful; it only walks through the workflow end to end.
# With a more complete dataset, engineered features would be added at this
# point, for example:
#   TotalRooms = LivingArea + BasementArea
#   TotalBath  = FullBathrooms + HalfBathrooms
y = data.loc[:, 'SalePrice']
X = data.loc[:, ['Id']]  # placeholder feature matrix (single column)

# --- Hold-out split ----------------------------------------------------------
# 80/20 train/test partition with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# --- Fit and predict ---------------------------------------------------------
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# --- Evaluation on the held-out rows -----------------------------------------
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


Mean Squared Error: 239521036.51357394
R^2 Score: 0.008827921820605544


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# --- Data loading ------------------------------------------------------------
# Replace this with the path to your own dataset if needed.
url = "/content/sample_data/california_housing_test.csv"
data = pd.read_csv(url)

# --- Quick inspection: column names and the first rows -----------------------
print("Available columns in the dataset:")
print(data.columns)

print("\nFirst few rows of the dataset:")
print(data.head())

# --- Features / target -------------------------------------------------------
# Predictors used for the regression; the target is the median house value.
feature_columns = [
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]
X = data.loc[:, feature_columns]
y = data.loc[:, 'median_house_value']

# --- Hold-out split ----------------------------------------------------------
# 80/20 train/test partition with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# --- Fit the linear regression and predict on the test rows ------------------
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# --- Evaluation: mean squared error and R^2 ----------------------------------
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


Available columns in the dataset:
Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')

First few rows of the dataset:
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.05     37.37                27.0       3885.0           661.0   
1    -118.30     34.26                43.0       1510.0           310.0   
2    -117.81     33.78                27.0       3589.0           507.0   
3    -118.36     33.82                28.0         67.0            15.0   
4    -119.67     36.33                19.0       1241.0           244.0   

   population  households  median_income  median_house_value  
0      1537.0       606.0         6.6085            344700.0  
1       809.0       277.0         3.5990            176500.0  
2      1484.0       495.0         5.7934            270500.0  
3        49.0        11.0         6.1359

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load the dataset
url = "/content/sample_data/california_housing_test.csv"
data = pd.read_csv(url)

# Verify the column names
print("Available columns in the dataset:")
print(data.columns)

# Look at the first few rows of the dataset
print("\nFirst few rows of the dataset:")
print(data.head())

# The previous version of this cell referenced 'LivingArea', 'BasementArea',
# 'FullBathrooms', 'HalfBathrooms' and 'SalePrice', none of which exist in this
# file, and failed with KeyError: 'LivingArea'. The columns used below are the
# ones actually present in the loaded CSV.
target_column = 'median_house_value'
required_columns = ['total_rooms', 'total_bedrooms', 'population', 'households', target_column]

# Fail early with a readable message if a different file is loaded.
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing expected columns: {missing_columns}")

# Feature Engineering: ratio features built from the raw block-level counts
data = data.assign(
    rooms_per_household=data['total_rooms'] / data['households'],
    bedrooms_per_room=data['total_bedrooms'] / data['total_rooms'],
    population_per_household=data['population'] / data['households'],
)

# A zero denominator yields +/-inf; turn those into NaN and drop incomplete rows
# so the estimator never receives non-finite values.
data = data.replace([np.inf, -np.inf], np.nan).dropna()

# Encoding categorical variables (leaves purely numeric columns untouched)
data = pd.get_dummies(data, drop_first=True)

# Feature and target variable selection
X = data.drop(columns=target_column)
y = data[target_column]

# Splitting the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing the features: statistics are fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Model Building with RandomForest
model = RandomForestRegressor(random_state=42)

# Hyperparameter Tuning with GridSearchCV
# 3 * 3 * 3 * 3 = 81 parameter combinations, each evaluated with 5-fold CV.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best model from GridSearchCV
best_model = grid_search.best_estimator_

# Predictions with the best model
y_pred = best_model.predict(X_test)

# Model Evaluation on the held-out test split
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\nBest Parameters: {grid_search.best_params_}")
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


Available columns in the dataset:
Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')

First few rows of the dataset:
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.05     37.37                27.0       3885.0           661.0   
1    -118.30     34.26                43.0       1510.0           310.0   
2    -117.81     33.78                27.0       3589.0           507.0   
3    -118.36     33.82                28.0         67.0            15.0   
4    -119.67     36.33                19.0       1241.0           244.0   

   population  households  median_income  median_house_value  
0      1537.0       606.0         6.6085            344700.0  
1       809.0       277.0         3.5990            176500.0  
2      1484.0       495.0         5.7934            270500.0  
3        49.0        11.0         6.1359

KeyError: 'LivingArea'