In [88]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [89]:
# Load datasets
train_df = pd.read_csv("hacktrain.csv")
test_df = pd.read_csv("hacktest.csv")

In [90]:
ndvi_columns = [col for col in train_df.columns if col.endswith("_N")]

In [91]:
def fill_missing_values(df, feature_cols):
    """
    Fills missing values in the specified feature columns of a DataFrame
    using linear interpolation along the rows (axis=1).
    """
    df[feature_cols] = df[feature_cols].interpolate(method='linear', axis=1, limit_direction='both')
    
    # If any NaNs remain (e.g., a whole row was NaN), fill them with 0.
    df[feature_cols] = df[feature_cols].fillna(0)
    return df

In [92]:
print(train_df.isnull().sum())

Unnamed: 0       0
ID               0
class            0
20150720_N     560
20150602_N    1200
20150517_N     800
20150501_N     960
20150415_N     480
20150330_N    1120
20150314_N     720
20150226_N    1360
20150210_N     640
20150125_N    1040
20150109_N     880
20141117_N    1280
20141101_N     400
20141016_N    1440
20140930_N     800
20140813_N     560
20140626_N    1600
20140610_N     480
20140525_N     720
20140509_N     880
20140423_N    1760
20140407_N     640
20140322_N    1120
20140218_N    1440
20140202_N     560
20140117_N    1200
20140101_N     400
dtype: int64


In [93]:
train_df = fill_missing_values(train_df, ndvi_columns)
print(train_df)

      Unnamed: 0     ID       class  20150720_N   20150602_N   20150517_N  \
0              0      1       water    637.5950   658.668000 -1882.030000   
1              1      2       water    634.2400   593.705000 -1625.790000   
2              3      4       water     58.0174 -1599.160000 -1325.895000   
3              4      5       water     72.5180   226.477000   380.436000   
4              7      8       water   1136.4400  1306.903333  1477.366667   
...          ...    ...         ...         ...          ...          ...   
7995       10537  10538  impervious   1207.7000   984.620000  1075.435000   
7996       10538  10539  impervious   2170.3500  1419.720000  1361.000000   
7997       10541  10542  impervious   1895.6800  1454.740000  1244.150000   
7998       10542  10543  impervious   3465.7400  1283.320000   413.412000   
7999       10544  10545  impervious   6941.1900  1667.870000  5084.780000   

      20150501_N  20150415_N  20150330_N  20150314_N  ...  20140610_N  \
0 

In [94]:
print(train_df.isnull().sum())

Unnamed: 0    0
ID            0
class         0
20150720_N    0
20150602_N    0
20150517_N    0
20150501_N    0
20150415_N    0
20150330_N    0
20150314_N    0
20150226_N    0
20150210_N    0
20150125_N    0
20150109_N    0
20141117_N    0
20141101_N    0
20141016_N    0
20140930_N    0
20140813_N    0
20140626_N    0
20140610_N    0
20140525_N    0
20140509_N    0
20140423_N    0
20140407_N    0
20140322_N    0
20140218_N    0
20140202_N    0
20140117_N    0
20140101_N    0
dtype: int64


In [95]:
min_vals = x_train.min()
max_vals = x_train.max()

In [96]:
x_train = train_df[ndvi_columns]
y_train = train_df['class']
print(y_train)

0            water
1            water
2            water
3            water
4            water
           ...    
7995    impervious
7996    impervious
7997    impervious
7998    impervious
7999    impervious
Name: class, Length: 8000, dtype: object


In [97]:
x_test = test_df[ndvi_columns]

In [98]:
# Apply normalization to training data
x_train = (x_train - min_vals) / (max_vals - min_vals)
    
# Apply the SAME normalization to testing data
x_test = (x_test - min_vals) / (max_vals - min_vals)

In [99]:
model = LogisticRegression( random_state=42)
model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [100]:
train_predictions = model.predict(x_train)

In [101]:
training_accuracy = accuracy_score(y_train, train_predictions)
print(training_accuracy)

0.841875


In [102]:
test_predictions = model.predict(x_test)

In [103]:
print(test_predictions)

['forest' 'forest' 'orchard' ... 'water' 'water' 'grass']


In [104]:
output_df = pd.DataFrame({
        'ID': test_df['ID'],
        'class': test_predictions
    })

In [105]:
print(output_df)

        ID    class
0        1   forest
1        2   forest
2        3  orchard
3        4     farm
4        5   forest
...    ...      ...
2840  2841    water
2841  2842    water
2842  2843    water
2843  2844    water
2844  2845    grass

[2845 rows x 2 columns]


In [106]:
output_filename = 'logistic_predicted_land_cover.csv'
output_df.to_csv(output_filename, index=False)

In [107]:
from sklearn.ensemble import RandomForestClassifier

In [108]:
RFmodel = RandomForestClassifier(random_state=42)
RFmodel.fit(x_train, y_train)

In [109]:
rftrain_predictions = RFmodel.predict(x_train)

In [110]:
rftraining_accuracy = accuracy_score(y_train, rftrain_predictions)
print(rftraining_accuracy)

1.0


In [111]:
rftest_predictions = RFmodel.predict(x_test)

In [112]:
rfoutput_df = pd.DataFrame({
        'ID': test_df['ID'],
        'class': rftest_predictions
    })

In [113]:
rfoutput_filename = 'RF_predicted_land_cover.csv'
rfoutput_df.to_csv(rfoutput_filename, index=False)

In [114]:
from sklearn.naive_bayes import GaussianNB

In [115]:
GNBmodel = GaussianNB()
GNBmodel.fit(x_train, y_train)

In [116]:
gnbtrain_predictions = GNBmodel.predict(x_train)

In [117]:
gnbtraining_accuracy = accuracy_score(y_train, gnbtrain_predictions)
print(gnbtraining_accuracy)

0.842625


In [118]:
gnbtest_predictions = GNBmodel.predict(x_test)

In [119]:
gnboutput_df = pd.DataFrame({
        'ID': test_df['ID'],
        'class': gnbtest_predictions
    })

In [120]:
gnboutput_filename = 'GNB_predicted_land_cover.csv'
gnboutput_df.to_csv(gnboutput_filename, index=False)

In [123]:
le = LabelEncoder()
y_train_enc = le.fit_transform(train_df["class"])

In [126]:
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    eval_metric='mlogloss'
)
xgb_model.fit(x_train, y_train_enc)

In [127]:
xgbtrain_predictions = xgb_model.predict(x_train)

In [128]:
xgbtraining_accuracy = accuracy_score(y_train_enc, xgbtrain_predictions)
print(xgbtraining_accuracy)

0.9985


In [129]:
xgbtest_predictions_enc = xgb_model.predict(x_test)

In [130]:
xgbtest_predictions=le.inverse_transform(xgbtest_predictions_enc)

In [131]:
print(xgbtest_predictions)

['forest' 'forest' 'forest' ... 'water' 'impervious' 'impervious']


In [132]:
xgboutput_df = pd.DataFrame({
        'ID': test_df['ID'],
        'class': xgbtest_predictions
    })

In [133]:
xgboutput_filename = 'XGB_predicted_land_cover.csv'
xgboutput_df.to_csv(xgboutput_filename, index=False)