## The following notebook creates synthetic data from an LLM that was fine-tuned on the California Housing Dataset

The output of this notebook can be used to create a Community Competition similar to this [Playground competition](https://www.kaggle.com/competitions/playground-series-s3e1) using refreshed data.

In [1]:
import os
import pandas as pd
from hashlib import md5

import warnings
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# Enumerate every file reachable under the Kaggle input mount, one path per line.
input_files = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file in input_files:
    print(input_file)

# Directory holding the fine-tuned model that is loaded later in the notebook.
model_path = '/kaggle/input/california-housing-fine-tuned-gpt2-medium'

/kaggle/input/california-housing-fine-tuned-gpt2-medium/config.json
/kaggle/input/california-housing-fine-tuned-gpt2-medium/model.pt
/kaggle/input/playground-series-s3e1/sample_submission.csv
/kaggle/input/playground-series-s3e1/train.csv
/kaggle/input/playground-series-s3e1/test.csv


In [2]:
!pip install transformers==4.26.1 -q # avoiding a dependency issue

In [3]:
!pip install be-great==0.0.3 -q

In [4]:
from be_great import GReaT

# Restore the fine-tuned GReaT model from the directory configured in `model_path`.
model = GReaT.load_from_dir(model_path)

# Keyword arguments forwarded unchanged to GReaT.sample.
sampling_options = {
    "n_samples": 1_000,    # change this to generate more samples
    "k": 50,
    "temperature": 0.7,    # values between 0.5-0.9 generally give good results
    "max_length": 256,
    "device": "cuda",
}

# Draw the synthetic rows and label the index axis 'id'.
data = model.sample(**sampling_options).rename_axis('id')
data.head()

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

1003it [00:51, 19.34it/s]


Unnamed: 0_level_0,Median Income,Age of House,Average Number of Rooms,Average Number of Bedrooms,Population,Average Occupancy,Latitude,Longitude,Median House Value
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,2.45,25.0,3.273438,1.0,1207.0,2.846354,33.96,-118.36,1.25
1,3.4018,29.0,3.980494,1.098793,2352.0,1.773756,34.11,-118.35,3.5
2,4.2813,17.0,5.427894,0.976281,2672.0,2.455041,33.97,-117.95,1.694
3,6.0604,21.0,7.171123,0.991736,1644.0,3.3925,38.71,-121.17,2.289
4,5.6015,24.0,6.087397,1.028519,1432.0,3.031674,34.11,-117.84,1.961


## You may need/want to do data cleanup here

* Remove outliers
* Ensure Lat/Lon make sense
* etc.

In [5]:
# Translate the descriptive column headers into the canonical feature names.
# Columns that are not listed here are left under their current name.
column_map = dict([
    ('Median Income', 'MedInc'),
    ('Age of House', 'HouseAge'),
    ('Average Number of Rooms', 'AveRooms'),
    ('Average Number of Bedrooms', 'AveBedrms'),
    ('Average Occupancy', 'AveOccup'),
    ('Median House Value', 'MedHouseVal'),
])

data = data.rename(columns=column_map)

## Create new competition files for a Community Competition

In [6]:
# Share of all rows that becomes the training set, and share of the *test* rows
# that is scored on the public leaderboard.
train_split = 0.6
public_split = 0.2

# Split by position rather than by index label: the first `train_split` share of
# rows is the training set and the remainder is the test set. A positional split
# keeps working when earlier cleanup has removed rows (leaving gaps in the index)
# and when the training share rounds down to zero rows.
train_ids = range(int(data.shape[0] * train_split))
train = data.iloc[:len(train_ids)]
# Copy so that popping the target below mutates an independent frame, not a slice of `data`.
test = data.iloc[len(train_ids):].copy()

# The target column is removed from `test` and becomes the solution file.
solution = test.pop('MedHouseVal').to_frame()

# random sort using hash: order rows by the MD5 digest of "<index><target>",
# which is reproducible without relying on a random seed
solution['sort'] = solution.index.astype(str) + solution['MedHouseVal'].astype(str)
solution['sort'] = solution['sort'].apply(lambda x: md5(x.encode('utf-8')).hexdigest())
assert solution['sort'].is_unique

solution = solution.sort_values('sort')
solution['Usage'] = 'Private'

# The first `public_split` share of the hash-ordered rows is marked 'Public'.
public_count = int(solution.shape[0] * public_split)
# Look the column position up by name instead of hard-coding it.
solution.iloc[:public_count, solution.columns.get_loc('Usage')] = 'Public'

# Drop the helper column and restore the original row order.
solution = solution.drop('sort', axis='columns').sort_index()

# sample submission to use the mean of the training data
submission = solution[['MedHouseVal']].copy().assign(MedHouseVal=train['MedHouseVal'].mean())

In [7]:
# Write each competition artefact to the working directory, using the default
# to_csv settings (the index is written as the first column).
competition_files = {
    'train.csv': train,
    'test.csv': test,
    'sample_submission.csv': submission,
    'solution.csv': solution,
}
for csv_name, frame in competition_files.items():
    frame.to_csv(csv_name)

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [9]:
# Load the train and test tables of the playground-series-s3e1 competition.
competition_dir = '/kaggle/input/playground-series-s3e1'
train_data, test_data = (
    pd.read_csv(f'{competition_dir}/{split_name}.csv')
    for split_name in ('train', 'test')
)

In [10]:
# Preview the first rows, then summarise the columns, non-null counts and dtypes.
print(train_data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping
# it in print() would append a stray "None" line to the output.
train_data.info()

   id  MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0   0  2.3859      15.0  3.827160   1.112100      1280.0  2.486989     34.60   
1   1  3.7188      17.0  6.013373   1.054217      1504.0  3.813084     38.69   
2   2  4.7750      27.0  6.535604   1.103175      1061.0  2.464602     34.71   
3   3  2.4138      16.0  3.350203   0.965432      1255.0  2.089286     32.66   
4   4  3.7500      52.0  4.284404   1.069246      1793.0  1.604790     37.80   

   Longitude  MedHouseVal  
0    -120.12        0.980  
1    -121.22        0.946  
2    -120.45        1.576  
3    -117.09        1.336  
4    -122.41        4.500  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37137 entries, 0 to 37136
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           37137 non-null  int64  
 1   MedInc       37137 non-null  float64
 2   HouseAge     37137 non-null  float64
 3   AveRooms     37137 non-null

In [11]:
# Separate the regression target from the predictors; the `id` column is
# excluded from the feature matrix as well.
target_name = 'MedHouseVal'
y = train_data[target_name]
X = train_data.drop(columns=['id', target_name])

In [12]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

In [13]:
# Random-forest baseline: 100 trees with a fixed seed.
forest_options = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**forest_options)


In [14]:
# Fit the baseline model on the training portion of the split.
model.fit(X=X_train, y=y_train)


In [15]:
# Predict on the held-out validation rows.
val_predictions = model.predict(X_val)

# RMSE = sqrt(MSE). The `squared=False` flag of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root explicitly works on
# every version.
rmse = mean_squared_error(y_val, val_predictions) ** 0.5
print(f'Root Mean Squared Error on Validation Set: {rmse}')

Root Mean Squared Error on Validation Set: 0.614959635507734


In [16]:
# Model inputs for the competition test set: every column except the row identifier.
test_features = test_data.drop('id', axis=1)

test_predictions = model.predict(test_features)

# Take the ids from the loaded test file instead of hard-coding the first id (37137),
# so the submission stays aligned with whatever test.csv contains.
submission_df = pd.DataFrame({'id': test_data['id'].to_numpy(), 'MedHouseVal': test_predictions})




In [17]:
# Sanity check: print the row count of the submission and of the raw predictions;
# the two numbers are expected to match.
for sized in (submission_df, test_predictions):
    print(len(sized))

24759
24759


In [18]:
# Leave the frame as the cell's last expression so the notebook renders it with its
# rich table display (long frames are truncated under the default display options).
submission_df


          id  MedHouseVal
0      37137     0.606620
1      37138     0.995460
2      37139     3.711931
3      37140     3.454893
4      37141     2.415920
...      ...          ...
24754  61891     1.980260
24755  61892     1.916770
24756  61893     1.157240
24757  61894     3.987671
24758  61895     3.460520

[24759 rows x 2 columns]


In [19]:
# Save the baseline submission; index=False keeps the DataFrame index out of the file.
submission_df.to_csv(path_or_buf='submission.csv', index=False)

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [21]:
# Candidate values explored by the grid search: 3 x 3 = 9 combinations.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20],
)

In [22]:
# Base estimator handed to the grid search; only the seed is fixed here.
forest_seed = 42
model = RandomForestRegressor(random_state=forest_seed)

In [23]:
# Exhaustive search over `param_grid`, scored by negative MSE with 3-fold cross-validation.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
)


**Training the model: run the grid search on the training split.**


In [24]:
# Run the cross-validated search on the training portion of the split.
grid_search.fit(X=X_train, y=y_train)

In [25]:
# Pull the winning estimator out of the finished search and report its settings.
best_model = grid_search.best_estimator_
print("Best Hyperparameters:", grid_search.best_params_)

# Score the tuned model on the held-out validation rows.
val_predictions = best_model.predict(X_val)

Best Hyperparameters: {'max_depth': None, 'n_estimators': 150}


In [26]:
# RMSE = sqrt(MSE). The `squared=False` flag of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root explicitly works on
# every version.
rmse = mean_squared_error(y_val, val_predictions) ** 0.5
print(f'Root Mean Squared Error on Validation Set with Grid Search: {rmse}')

Root Mean Squared Error on Validation Set with Grid Search: 0.6134939292469728


In [27]:
# Predict the competition test set with the tuned model.
test_predictions = best_model.predict(test_features)

# Take the ids from the loaded test file instead of hard-coding the first id (37137),
# so the submission stays aligned with whatever test.csv contains.
submission_df = pd.DataFrame({'id': test_data['id'].to_numpy(), 'MedHouseVal': test_predictions})

print(submission_df)

          id  MedHouseVal
0      37137     0.607853
1      37138     0.991587
2      37139     3.609887
3      37140     3.415829
4      37141     2.480707
...      ...          ...
24754  61891     1.959107
24755  61892     1.950180
24756  61893     1.173273
24757  61894     4.038461
24758  61895     3.429960

[24759 rows x 2 columns]


In [28]:
# Save the tuned-model submission; index=False keeps the DataFrame index out of the file.
submission_df.to_csv(path_or_buf='submission_grid_search.csv', index=False)


In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

# Assuming 'train_data' is loaded and processed as before

# Split the data into training and validation sets
X = train_data.drop(['id', 'MedHouseVal'], axis=1)
y = train_data['MedHouseVal']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for XGBoost (3 x 3 x 3 = 27 combinations)
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
    # Add other hyperparameters as needed
}

# Initialize the XGBoost model
xgb_model = XGBRegressor(random_state=42)

# Initialize GridSearchCV for XGBoost (negative MSE scoring, 3-fold cross-validation)
grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3)

# Fit the grid search to the data
grid_search_xgb.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters for XGBoost:", grid_search_xgb.best_params_)

# Get the best XGBoost model from grid search
best_xgb_model = grid_search_xgb.best_estimator_

# Make predictions on the validation set
val_predictions_xgb = best_xgb_model.predict(X_val)

# Calculate RMSE for XGBoost as sqrt(MSE). The `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root explicitly works on every version.
rmse_xgb = mean_squared_error(y_val, val_predictions_xgb) ** 0.5
print(f'Root Mean Squared Error on Validation Set with XGBoost: {rmse_xgb}')

# Assuming 'test_data' is loaded and processed as before
test_features = test_data.drop('id', axis=1)

# Make predictions on the test set using the best XGBoost model
test_predictions_xgb = best_xgb_model.predict(test_features)

# Create a submission DataFrame whose ids come from the loaded test file instead of
# a hard-coded starting id (37137), so it stays aligned with whatever test.csv contains
submission_df_xgb = pd.DataFrame({'id': test_data['id'].to_numpy(), 'MedHouseVal': test_predictions_xgb})

# Print the submission DataFrame
print(submission_df_xgb)

# Save the submission file
submission_df_xgb.to_csv('submission_xgb.csv', index=False)


Best Hyperparameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150}
Root Mean Squared Error on Validation Set with XGBoost: 0.597987194827306
          id  MedHouseVal
0      37137     0.731977
1      37138     1.026497
2      37139     3.960536
3      37140     3.493225
4      37141     2.299681
...      ...          ...
24754  61891     2.139527
24755  61892     1.941486
24756  61893     1.323384
24757  61894     3.849522
24758  61895     3.772335

[24759 rows x 2 columns]
