In [16]:
!pip install xlrd
!pip install tensorflow




In [12]:
import pandas as pd
import os

# Verify if the file exists
input_file_path = r'C:/Users/ravip/Downloads/X_Train_Data_Input.xls'
target_file_path = r'C:/Users/ravip/Downloads/Y_Train_Data_Target.xls'

# Check if the paths exist
print("Checking file paths...")
print(f"X_Train_Data_Input exists: {os.path.exists(input_file_path)}")
print(f"Y_Train_Data_Target exists: {os.path.exists(target_file_path)}")

if os.path.exists(input_file_path) and os.path.exists(target_file_path):
    # Step 1: Load the dataset
    X_train_data_input = pd.read_excel(input_file_path)
    y_train_data_target = pd.read_excel(target_file_path)
    
    print("Files loaded successfully!")
else:
    print("Error: One or both file paths are incorrect.")


Checking file paths...
X_Train_Data_Input exists: True
Y_Train_Data_Target exists: True
Files loaded successfully!


In [3]:
import os

file_path = r'D:\dataset\X_Train_Data_Input.csv'
print(os.path.exists(file_path))


False


In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

input_file_path = 'C:/Users/ravip/Downloads/X_Train_Data_Input.xls'
target_file_path = 'C:/Users/ravip/Downloads/Y_Train_Data_Target.xls'

X_train_data_input = pd.read_excel(input_file_path)
y_train_data_target = pd.read_excel(target_file_path)

X_train_data_input = X_train_data_input.iloc[:20000, 1:]  
y_train_data_target = y_train_data_target.iloc[:20000, 1]  

# Fill NaN values with the mean of each column
X_train_data_input.fillna(X_train_data_input.mean(), inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X_train_data_input, y_train_data_target, test_size=0.2, random_state=42)

linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)
linear_mse = mean_squared_error(y_test, y_pred_linear)
print(f'Linear Regression Mean Squared Error: {linear_mse}')

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, y_pred_rf)
print(f'Random Forest Mean Squared Error: {rf_mse}')

mlp_model = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=20000, random_state=42)
mlp_model.fit(X_train, y_train)
y_pred_mlp = mlp_model.predict(X_test)
mlp_mse = mean_squared_error(y_test, y_pred_mlp)
print(f'MLP Mean Squared Error: {mlp_mse}')



Linear Regression Mean Squared Error: 0.038555240831427175
Random Forest Mean Squared Error: 0.019979649999999998
MLP Mean Squared Error: 15.995743078861503


In [38]:
# Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.neural_network import MLPRegressor

# Load Data
input_file_path = 'C:/Users/ravip/Downloads/X_Train_Data_Input.xls'
target_file_path = 'C:/Users/ravip/Downloads/Y_Train_Data_Target.xls'

X_train_data_input = pd.read_excel(input_file_path)
y_train_data_target = pd.read_excel(target_file_path)

X_train_data_input = X_train_data_input.iloc[:20000, 1:]
y_train_data_target = y_train_data_target.iloc[:20000, 1]

# Fill NaN values with the mean of each column
X_train_data_input.fillna(X_train_data_input.mean(), inplace=True)

# Feature Engineering
# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train_data_input)

# Feature Selection
selector = SelectKBest(score_func=f_regression, k='all')
X_selected = selector.fit_transform(X_scaled, y_train_data_target)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_train_data_target, test_size=0.2, random_state=42)

# Model Training and Optimization

# 1. Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)
linear_mse = mean_squared_error(y_test, y_pred_linear)
print(f'Linear Regression Mean Squared Error: {linear_mse}')

# 2. Random Forest Regressor with GridSearchCV for parameter optimization
rf_model = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
rf_grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
rf_grid_search.fit(X_train, y_train)
y_pred_rf = rf_grid_search.best_estimator_.predict(X_test)
rf_mse = mean_squared_error(y_test, y_pred_rf)
print(f'Random Forest Mean Squared Error: {rf_mse}')
print(f'Best Parameters for Random Forest: {rf_grid_search.best_params_}')

mlp_model = MLPRegressor(max_iter=1000, random_state=42)
param_grid_mlp = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01]
}
mlp_grid_search = GridSearchCV(estimator=mlp_model, param_grid=param_grid_mlp, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
mlp_grid_search.fit(X_train, y_train)
y_pred_mlp = mlp_grid_search.best_estimator_.predict(X_test)
mlp_mse = mean_squared_error(y_test, y_pred_mlp)
print(f'MLP Mean Squared Error: {mlp_mse}')
print(f'Best Parameters for MLP: {mlp_grid_search.best_params_}')


Linear Regression Mean Squared Error: 0.03855524083142717
Random Forest Mean Squared Error: 0.019529290762776052
Best Parameters for Random Forest: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 100}
MLP Mean Squared Error: 0.022108218156571526
Best Parameters for MLP: {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (100, 50), 'solver': 'adam'}


In [39]:
# Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.neural_network import MLPRegressor

# Load Data
input_file_path = 'C:/Users/ravip/Downloads/X_Train_Data_Input.xls'
target_file_path = 'C:/Users/ravip/Downloads/Y_Train_Data_Target.xls'

X_train_data_input = pd.read_excel(input_file_path)
y_train_data_target = pd.read_excel(target_file_path)

# Use only the first 2000 rows
X_train_data_input = X_train_data_input.iloc[:2000, 1:]  # Adjust to use 2000 rows
y_train_data_target = y_train_data_target.iloc[:2000, 1]

# Fill NaN values with the mean of each column
X_train_data_input.fillna(X_train_data_input.mean(), inplace=True)

# Feature Engineering
# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train_data_input)

# Feature Selection
selector = SelectKBest(score_func=f_regression, k='all')
X_selected = selector.fit_transform(X_scaled, y_train_data_target)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_train_data_target, test_size=0.2, random_state=42)

# Model Training and Optimization

# 1. Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)
linear_mse = mean_squared_error(y_test, y_pred_linear)
print(f'Linear Regression Mean Squared Error: {linear_mse}')

# 2. Random Forest Regressor with GridSearchCV for parameter optimization
rf_model = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
rf_grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
rf_grid_search.fit(X_train, y_train)
y_pred_rf = rf_grid_search.best_estimator_.predict(X_test)
rf_mse = mean_squared_error(y_test, y_pred_rf)
print(f'Random Forest Mean Squared Error: {rf_mse}')
print(f'Best Parameters for Random Forest: {rf_grid_search.best_params_}')

# 3. MLP Regressor with GridSearchCV for parameter optimization
mlp_model = MLPRegressor(max_iter=2000, random_state=42)  # Changed max_iter to 2000
param_grid_mlp = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01]
}
mlp_grid_search = GridSearchCV(estimator=mlp_model, param_grid=param_grid_mlp, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
mlp_grid_search.fit(X_train, y_train)
y_pred_mlp = mlp_grid_search.best_estimator_.predict(X_test)
mlp_mse = mean_squared_error(y_test, y_pred_mlp)
print(f'MLP Mean Squared Error: {mlp_mse}')
print(f'Best Parameters for MLP: {mlp_grid_search.best_params_}')


Linear Regression Mean Squared Error: 1.5863728654133127
Random Forest Mean Squared Error: 0.017812540244835858
Best Parameters for Random Forest: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}
MLP Mean Squared Error: 0.02373162751136616
Best Parameters for MLP: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (100, 50), 'solver': 'adam'}


In [41]:
# Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.neural_network import MLPRegressor

# Load Data
input_file_path = 'C:/Users/ravip/Downloads/X_Train_Data_Input.xls'
target_file_path = 'C:/Users/ravip/Downloads/Y_Train_Data_Target.xls'

# Read the input data
X_train_data_input = pd.read_excel(input_file_path)
y_train_data_target = pd.read_excel(target_file_path)

# Check for non-numeric columns
non_numeric_cols = X_train_data_input.select_dtypes(exclude=[np.number]).columns.tolist()
print(f'Non-numeric columns: {non_numeric_cols}')

# Optionally, you can convert or drop non-numeric columns
# For example, you could drop them:
X_train_data_input = X_train_data_input.drop(columns=non_numeric_cols)

# Or if you need to convert categorical data to numeric, you can use one-hot encoding or label encoding.
# X_train_data_input = pd.get_dummies(X_train_data_input, drop_first=True)

# Fill NaN values with the mean of each numeric column
X_train_data_input.fillna(X_train_data_input.mean(), inplace=True)

# Feature Engineering
# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train_data_input)

# Feature Selection
selector = SelectKBest(score_func=f_regression, k='all')
X_selected = selector.fit_transform(X_scaled, y_train_data_target.iloc[:, 1])  # Adjust if necessary

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_train_data_target.iloc[:, 1], test_size=0.2, random_state=42)

# Model Training and Optimization

# 1. Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)
linear_mse = mean_squared_error(y_test, y_pred_linear)
print(f'Linear Regression Mean Squared Error: {linear_mse}')

# 2. Random Forest Regressor with GridSearchCV for parameter optimization
rf_model = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
rf_grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
rf_grid_search.fit(X_train, y_train)
y_pred_rf = rf_grid_search.best_estimator_.predict(X_test)
rf_mse = mean_squared_error(y_test, y_pred_rf)
print(f'Random Forest Mean Squared Error: {rf_mse}')
print(f'Best Parameters for Random Forest: {rf_grid_search.best_params_}')

# 3. MLP Regressor with GridSearchCV for parameter optimization
mlp_model = MLPRegressor(max_iter=2000, random_state=42)
param_grid_mlp = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01]
}
mlp_grid_search = GridSearchCV(estimator=mlp_model, param_grid=param_grid_mlp, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
mlp_grid_search.fit(X_train, y_train)
y_pred_mlp = mlp_grid_search.best_estimator_.predict(X_test)
mlp_mse = mean_squared_error(y_test, y_pred_mlp)
print(f'MLP Mean Squared Error: {mlp_mse}')
print(f'Best Parameters for MLP: {mlp_grid_search.best_params_}')



Non-numeric columns: ['ID']
Linear Regression Mean Squared Error: 0.03479467208931874
Random Forest Mean Squared Error: 0.016529984126529982
Best Parameters for Random Forest: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
MLP Mean Squared Error: 0.020714333557844567
Best Parameters for MLP: {'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (100, 50), 'solver': 'adam'}
