In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

In [2]:
train_df = pd.read_csv("dataset/train.csv")
test_df = pd.read_csv("dataset/test.csv")

In [3]:
train_df.sample(5)

Unnamed: 0,id,temperature,irradiance,humidity,panel_age,maintenance_count,soiling_ratio,voltage,current,module_temperature,cloud_coverage,wind_speed,pressure,string_id,error_code,installation_type,efficiency
10327,10327,32.320412,410.516186,23.28572248452914,25.621156,2.0,0.836774,9.398363,1.782237,33.787534,96.961527,0.2757858107796695,1016.9638811536728,C3,,,0.463632
19656,19656,18.380519,249.947844,94.09212905578138,27.319468,5.0,0.473711,9.367055,,18.950701,49.411542,8.452200929393879,1018.8931654682063,B2,E01,tracking,0.377699
11000,11000,25.581712,521.058426,41.86538052295218,20.158103,2.0,0.545812,2.524424,0.057925,32.480799,21.72604,4.314422163831419,1012.58070621811,A1,,dual-axis,0.504014
8274,8274,33.120379,782.734449,16.155484259660003,0.645231,3.0,0.920842,0.0,1.482355,32.892207,53.072578,6.590885508434157,1002.7298337168144,C3,,tracking,0.738987
9837,9837,42.853284,404.515821,71.23726160877811,9.184269,4.0,0.568689,10.644852,2.581581,49.834236,13.164535,5.224757619270968,1001.7806812033214,D4,,fixed,0.475121


In [4]:
test_df.sample(5)

Unnamed: 0,id,temperature,irradiance,humidity,panel_age,maintenance_count,soiling_ratio,voltage,current,module_temperature,cloud_coverage,wind_speed,pressure,string_id,error_code,installation_type
5382,5382,3.119197,,34.0911998772063,28.59428,4.0,0.600098,29.802125,1.881209,10.61853,3.424474,13.147257361819932,1011.9320813789836,D4,E00,
5868,5868,,717.132502,87.15956761944075,26.207789,7.0,0.920271,37.841613,0.614671,53.716569,67.271249,13.064102331222305,1008.467927242124,B2,E02,
6725,6725,2.868473,326.2902,12.745255899794028,2.410858,5.0,0.914829,0.0,0.569378,5.300209,40.715933,0.6075123283196882,1015.3997742048202,C3,E00,fixed
4013,4013,12.129959,316.707311,28.762001929939995,29.856782,1.0,0.926573,0.0,1.676393,15.539193,22.820544,5.834279252030089,1030.5058677417828,A1,E00,
3362,3362,24.12854,246.637548,95.48007221721504,30.400709,7.0,0.658458,0.0,0.309681,27.282346,18.430215,12.049373505366686,1015.2941676026468,A1,E02,


In [5]:
train_df.shape

(20000, 17)

In [6]:
test_df.shape

(12000, 16)

In [7]:
train_df.drop([ 'id', 'string_id', 'error_code', 'installation_type'], axis='columns', inplace=True)

In [8]:
train_df.sample(5)

Unnamed: 0,temperature,irradiance,humidity,panel_age,maintenance_count,soiling_ratio,voltage,current,module_temperature,cloud_coverage,wind_speed,pressure,efficiency
4811,40.368088,403.329089,33.06915722900163,16.28834,3.0,0.869125,0.490867,2.684397,47.991489,55.900067,7.184421631500895,1009.1067943275722,0.560554
16734,,278.282592,43.49667873446712,19.812033,5.0,,17.71082,0.199153,37.455644,39.469428,0.608826611483978,1010.0273310560568,0.442141
19779,21.334369,425.592645,27.82281831355101,21.034497,4.0,0.519598,29.859889,0.793821,26.457094,82.167286,3.713400479235776,1021.6572970320292,0.545451
9846,23.08251,829.109987,46.04292087546935,20.627913,8.0,0.967687,0.0,1.45714,23.892435,34.321749,10.313796656522976,1016.9483490659176,0.762859
5889,,441.300151,64.38783646206072,19.098587,1.0,0.493791,8.417732,0.447656,35.009403,15.713789,8.190108212000942,992.159757322554,0.454051


In [9]:
test_df.drop(['string_id', 'error_code', 'installation_type'], axis='columns', inplace=True)

In [10]:
train_df.shape

(20000, 13)

In [11]:
test_df.shape

(12000, 13)

In [12]:
train_df.dtypes

temperature           float64
irradiance            float64
humidity               object
panel_age             float64
maintenance_count     float64
soiling_ratio         float64
voltage               float64
current               float64
module_temperature    float64
cloud_coverage        float64
wind_speed             object
pressure               object
efficiency            float64
dtype: object

In [13]:
train_df.humidity = pd.to_numeric(train_df.humidity, errors='coerce')
train_df.wind_speed = pd.to_numeric(train_df.wind_speed, errors='coerce')
train_df.pressure = pd.to_numeric(train_df.pressure, errors='coerce')

In [14]:
train_df.dtypes

temperature           float64
irradiance            float64
humidity              float64
panel_age             float64
maintenance_count     float64
soiling_ratio         float64
voltage               float64
current               float64
module_temperature    float64
cloud_coverage        float64
wind_speed            float64
pressure              float64
efficiency            float64
dtype: object

In [15]:
train_df.sample(5)

Unnamed: 0,temperature,irradiance,humidity,panel_age,maintenance_count,soiling_ratio,voltage,current,module_temperature,cloud_coverage,wind_speed,pressure,efficiency
514,30.588913,697.70153,99.079574,32.547846,4.0,0.794684,34.474684,1.928077,39.075323,73.657761,3.975797,1024.982628,0.560241
3109,19.474224,296.756494,45.786384,11.765688,5.0,0.595467,0.0,0.473531,25.06258,45.426719,14.817278,1011.297486,0.447228
10582,22.047367,360.896463,70.443605,16.694806,4.0,0.896409,0.0,1.955073,25.526172,97.093726,5.100277,1026.314893,0.511096
14071,39.19533,577.127955,69.358736,25.067758,,0.586548,,1.871685,39.621169,60.039053,5.373991,1013.328057,0.452333
6288,0.0,,25.836556,8.91916,4.0,0.615713,37.447667,2.365905,5.029485,93.004245,2.59771,1029.566102,0.509542


In [16]:
train_df.fillna(train_df.mean(), inplace=True)

In [17]:
train_df.shape

(20000, 13)

In [18]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(train_df)
scaled_df = pd.DataFrame(scaled_data, columns=train_df.columns)

print(scaled_df.head())

   temperature  irradiance  humidity  panel_age  maintenance_count  \
0     0.053037    0.549606  0.412388   0.918197           0.266667   
1     0.168159    0.392153  0.013491   0.570796           0.533333   
2     0.316517    0.601798  0.912688   0.042722           0.266667   
3     0.361884    0.624058  0.961952   0.528338           0.200000   
4     0.037826    0.285477  0.274886   0.877828           0.400000   

   soiling_ratio   voltage   current  module_temperature  cloud_coverage  \
0       0.671974  0.075673  0.268433            0.210633        0.062494   
1       0.132223  0.044192  0.033001            0.423771        0.043851   
2       0.703984  0.097562  0.572992            0.667134        0.051378   
3       0.729209  0.093663  0.131298            0.888007        0.067361   
4       0.252891  0.000000  0.122753            0.104404        0.003632   

   wind_speed  pressure  efficiency  
0    0.855013  0.589274    0.569462  
1    0.800815  0.670906    0.401642  
2    0.1

In [19]:
X = train_df.drop(columns=['efficiency'])
y = train_df['efficiency'] 

In [20]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [155]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np

# Create and train XGBoost model
model = xgb.XGBRegressor(
    max_depth=3,
    n_estimators=200,
    learning_rate=0.05,
    subsample=0.8,
    random_state=43
)

model.fit(X_train, y_train)

# Predict on validation set
y_pred = model.predict(X_val)

# Calculate custom score
mse = mean_squared_error(y_val, y_pred)
score = 100 * (1 - np.sqrt(mse))

print(f"[XGBoost] Custom Score: {score:.4f}")


[XGBoost] Custom Score: 89.3942


In [156]:
Score = 100*(1-np.sqrt(mean_squared_error(y_val,y_pred)))
Score

89.39421401921513

In [157]:
test_df.humidity = pd.to_numeric(test_df.humidity, errors='coerce')
test_df.wind_speed = pd.to_numeric(test_df.wind_speed, errors='coerce')
test_df.pressure = pd.to_numeric(test_df.pressure, errors='coerce')

In [158]:
test_df.fillna(test_df.mean(), inplace=True)

In [159]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(test_df)
scaled_df = pd.DataFrame(scaled_data, columns=test_df.columns)

print(scaled_df.head())

         id  temperature  irradiance  humidity  panel_age  maintenance_count  \
0  0.000000     0.120773    0.327326  0.908310   0.397343           0.461538   
1  0.000083     0.238733    0.648429  0.209792   0.597640           0.307692   
2  0.000167     0.231540    0.528871  0.556199   0.040983           0.230769   
3  0.000250     0.127394    0.460620  0.490488   0.537415           0.384615   
4  0.000333     0.295071    0.504589  0.087544   0.499202           0.615385   

   soiling_ratio   voltage   current  module_temperature  cloud_coverage  \
0       0.816295  0.015252  0.009514            0.300266        0.033509   
1       0.317251  0.072054  0.236178            0.575715        0.032326   
2       0.352344  0.068053  0.233847            0.494581        0.069612   
3       0.500765  0.018789  0.108474            0.395910        0.042862   
4       0.274857  0.029450  0.257369            0.462125        0.051025   

   wind_speed  pressure  
0    0.478827  0.759928  
1    0.278

In [160]:
train_df.sample(5)

Unnamed: 0,temperature,irradiance,humidity,panel_age,maintenance_count,soiling_ratio,voltage,current,module_temperature,cloud_coverage,wind_speed,pressure,efficiency
7358,31.337534,164.311513,23.032944,6.949721,5.0,0.8747,28.400059,0.005076,34.039022,71.087805,14.382933,1013.026725,0.432834
1440,40.01336,333.8435,91.311694,13.959164,4.0,0.831418,11.081944,0.569897,46.533804,58.240355,13.505402,1025.638902,0.471572
5661,31.310217,561.060436,38.983429,13.97617,4.0,0.963592,5.65643,0.548339,29.923807,87.334272,6.842976,1009.387484,0.0
18953,20.261512,109.317498,73.209494,11.655436,4.0,0.704059,0.0,0.939739,26.562341,44.245162,4.137507,1022.636701,0.421833
10213,29.97319,590.702444,37.432768,21.116517,3.0,0.965954,0.0,2.764976,31.274498,51.062462,0.134176,1032.663128,0.650223


In [161]:
X_test = test_df.drop(columns=['id'])

In [162]:
test_df.sample(5)

Unnamed: 0,id,temperature,irradiance,humidity,panel_age,maintenance_count,soiling_ratio,voltage,current,module_temperature,cloud_coverage,wind_speed,pressure
7994,7994,45.106982,595.1104,83.203749,8.037204,3.0,0.814575,1.949624,2.010322,50.334044,29.931105,3.594764,1010.991316
509,509,25.146375,823.696027,44.041898,7.67155,2.0,0.723024,0.0,2.80885,34.098213,66.781368,0.767494,1010.192628
2352,2352,28.298918,381.26105,40.215742,20.498658,5.0,0.496815,26.072984,0.301714,31.092661,56.659307,2.016336,1024.433111
10069,10069,25.762183,649.410295,75.955745,32.034471,7.0,0.928293,19.795139,0.534303,32.326854,73.170091,7.358067,1013.638372
593,593,24.967661,554.437731,44.525596,12.754341,5.0,0.751808,16.810941,1.352059,30.410996,11.518585,7.373409,1004.2284


In [163]:
X.sample(5)

Unnamed: 0,temperature,irradiance,humidity,panel_age,maintenance_count,soiling_ratio,voltage,current,module_temperature,cloud_coverage,wind_speed,pressure
2104,40.607848,430.080792,97.957911,9.329875,6.0,0.802884,0.0,0.844006,46.076335,51.378575,11.969628,1032.341112
8906,20.999942,749.825925,19.099239,27.266519,3.0,0.49401,66.972959,3.34572,25.152885,4.650411,2.290557,1005.110015
631,20.504068,1137.331988,50.886508,33.118478,5.0,0.886192,41.720358,3.03465,30.929778,70.08176,5.419035,1025.054146
17332,25.077241,672.792134,73.069575,2.961481,5.0,0.720969,23.758208,0.812228,18.327944,1.750633,2.993934,1006.130794
3709,6.81311,430.436558,2.14659,32.672836,7.0,0.815806,0.0,1.30661,12.766164,18.656995,9.310781,1020.612351


In [164]:
X_test.sample(5)

Unnamed: 0,temperature,irradiance,humidity,panel_age,maintenance_count,soiling_ratio,voltage,current,module_temperature,cloud_coverage,wind_speed,pressure
10598,7.310119,233.710244,7.404758,27.883696,4.0,0.722594,0.0,0.815766,15.821477,47.187554,7.519368,1028.701596
2944,29.525479,571.767906,35.179949,21.001102,4.0,0.512886,11.366543,2.381041,33.910899,62.794977,8.772224,1006.882738
7359,50.953494,474.225242,50.188207,7.163178,4.0,0.682765,28.167628,3.134684,61.124819,17.069754,12.958622,1015.064898
6616,50.849644,461.177416,45.489297,27.022513,1.0,0.525391,0.0,0.146455,51.897662,54.333365,11.641402,1020.40948
8146,26.627691,345.071499,74.560238,30.602213,7.0,0.848449,31.112994,1.838251,30.928218,55.740238,5.306279,1001.19532


In [165]:
y_pred = model.predict(X_test)

In [166]:
import pandas as pd

# Assuming `test_df` has the 'id' column
submission = pd.DataFrame({
    'id': test_df['id'],
    'efficiency': y_pred
})

# Check shape before saving
assert submission.shape == (12000, 2), "Submission file must be 12000 x 2"

# Save to CSV
submission.to_csv('submission.csv', index=False)