In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the train and test tables, then the sample submission file.
train_df, test_df = [
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/test.csv')
]
sub = pd.read_csv('sample-submission.csv')

# Preview the first training rows as the cell output.
train_df.head()

Unnamed: 0,year,month,day,hour,minute,lat,lon,intensity,size,distance,Storm_NosyBe_1h,Storm_NosyBe_3h
0,2004,1,19,10,30,-13.6126,48.2281,468,1422,10.44,0,1
1,2004,1,19,10,45,-13.7039,48.2598,488,1881,13.34,0,1
2,2004,1,19,11,0,-13.7953,48.2918,424,1746,16.28,0,1
3,2004,1,19,11,15,-14.219,48.3387,485,4167,30.41,0,1
4,2004,1,19,11,30,-13.2851,49.2745,155,207,19.1,0,1


In [None]:
!pip install geopy




In [None]:
from geopy.distance import geodesic

def create_time_features(df):
    """Add calendar-derived columns to ``df`` in place and return it.

    Reads the ``year``, ``month``, ``day``, ``hour`` and ``minute`` columns and
    writes ``datetime``, ``day_of_week``, ``hour_sin`` and ``hour_cos``.
    """
    # Assemble one timestamp per row from its component columns.
    timestamp_parts = df[['year', 'month', 'day', 'hour', 'minute']]
    df['datetime'] = pd.to_datetime(timestamp_parts)
    df['day_of_week'] = df['datetime'].dt.dayofweek

    # Cyclical encoding of the hour: map it onto a 24-hour circle so that
    # hour 23 and hour 0 end up next to each other.
    hour_angle = 2 * np.pi * df['hour'] / 24
    df['hour_sin'] = np.sin(hour_angle)
    df['hour_cos'] = np.cos(hour_angle)

    return df

# Create spatial features
def create_spatial_features(df):
    """Add ratio-style columns to ``df`` in place and return it.

    Reads ``intensity``, ``size`` and ``distance``; writes
    ``intensity_density`` and ``storm_proximity``.
    """
    # Intensity per unit of storm size.
    df['intensity_density'] = df['intensity'].div(df['size'])

    # Inverse-distance proximity score. The +1 offset keeps the denominator
    # non-zero when distance is 0.
    offset_distance = df['distance'] + 1
    df['storm_proximity'] = 1 / offset_distance

    return df

def extra_features(df):
    """Add row-order-dependent storm features to ``df`` in place and return it.

    Every cumsum/shift/diff/rolling below operates on consecutive rows, so the
    result depends on the row order of ``df``.
    NOTE(review): presumably rows are expected in chronological order and
    belong to one continuous track -- confirm against the data.

    Reads ``intensity``, ``size``, ``lat``, ``lon``, ``distance`` and the
    ``datetime`` column written by ``create_time_features``.
    """
    # Local import: statsmodels is only needed by this function.
    from statsmodels.tsa.seasonal import seasonal_decompose

    # Running totals over the whole frame.
    df['cumulative_intensity'] = df['intensity'].cumsum()
    df['cumulative_size'] = df['size'].cumsum()

    # Position of the previous row, used for the displacement below.
    df['previous_lat'] = df['lat'].shift(1)
    df['previous_lon'] = df['lon'].shift(1)

    # Geodesic distance in km from the previous row's position; 0 when there
    # is no previous position (first row).
    df['distance_travelled'] = df.apply(
        lambda row: geodesic((row['lat'], row['lon']), (row['previous_lat'], row['previous_lon'])).km
        if pd.notna(row['previous_lat']) and pd.notna(row['previous_lon'])
        else 0, axis=1
        )

    # log(x + 1) transforms, computed on the whole column at once.
    df['log_intensity'] = np.log(df['intensity'] + 1)
    df['log_size'] = np.log(df['size'] + 1)

    # Trailing means over the last 2 rows (intensity) and last 3 rows (size).
    # The "_1h" suffix is kept because downstream cells select these names.
    # NOTE(review): the two windows differ and neither is obviously one hour --
    # confirm the intended window length.
    df['rolling_intensity_mean_1h'] = df['intensity'].rolling(window=2).mean()
    df['rolling_size_mean_1h'] = df['size'].rolling(window=3).mean()

    # Change relative to the value two rows earlier.
    df['Difference_intensity'] = df['intensity'].diff(2)
    df['Difference_size'] = df['size'].diff(2)
    # One-row change, lagged by two rows.
    df['Difference_intensity_shift'] = df['intensity'].shift(2).diff()
    df['Difference_size_shift'] = df['size'].shift(2).diff()

    # Hours elapsed since the previous row.
    df['time_since_last_storm'] = df['datetime'].diff().dt.total_seconds() / 3600

    # Speed in km/h between consecutive rows. A zero time gap (two rows with
    # the same timestamp) would make the division return +/-inf, so the gap is
    # treated as missing and the speed becomes NaN for those rows.
    elapsed_hours = df['time_since_last_storm'].replace(0, np.nan)
    df['storm_speed_kmh'] = df['distance_travelled'] / elapsed_hours

    # Bucket intensity into four right-closed bins: (0, 250] -> 0, (250, 500] -> 1,
    # (500, 750] -> 2, (750, 1000] -> 3. Values outside (0, 1000] become NaN.
    df['storm_severity'] = pd.cut(df['intensity'], bins=[0, 250, 500, 750, 1000], labels=[0, 1, 2, 3])

    # Previous row's raw values.
    df['lagged_intensity'] = df['intensity'].shift(1)
    df['lagged_size'] = df['size'].shift(1)

    # Pairwise and three-way interaction terms.
    df["distance_intensity"] = df["distance"] * df["intensity"]
    df["size_distance"] = df["distance"] * df["size"]
    df["size_distance_intensity"] = df["distance"] * df["size"] * df["intensity"]

    # Additive seasonal decomposition of the intensity sequence with a period
    # of 3 rows; its three components are stored as separate columns.
    result = seasonal_decompose(df['intensity'], model='additive', period=3)
    df['Trend'] = result.trend
    df['Seasonal'] = result.seasonal
    df['Residual'] = result.resid
    return df


# Apply feature engineering: run each builder on train, then on test, in the
# order time -> spatial -> extra (extra_features needs the datetime column
# that create_time_features writes).
for feature_step in (create_time_features, create_spatial_features, extra_features):
    train_df = feature_step(train_df)
    test_df = feature_step(test_df)

In [None]:
def create_storm_features(df):
    """Add cyclone-season and time-of-day indicator columns to ``df`` in place.

    Reads ``month`` and ``hour``; writes ``is_peak_cyclone_season``,
    ``is_cyclone_season``, ``cyclone_season_weight``, ``is_daytime`` and the
    two season-by-daytime interaction columns. Returns the same ``df``.
    """
    month = df['month']
    hour = df['hour']

    # Nosy Be cyclone season runs November to April; January-March is the peak.
    # Vectorised membership tests replace the per-row lambdas.
    df['is_peak_cyclone_season'] = month.isin([1, 2, 3]).astype('int64')
    df['is_cyclone_season'] = month.isin([11, 12, 1, 2, 3, 4]).astype('int64')

    # Per-month weights; months that are not listed get a weight of 0.
    cyclone_weights = {1: 0.9, 2: 0.8, 3: 0.7, 4: 0.5, 11: 0.6, 12: 0.7}
    df['cyclone_season_weight'] = month.map(cyclone_weights).fillna(0)

    # Daytime is 06:00 (inclusive) to 18:00 (exclusive).
    df['is_daytime'] = ((hour >= 6) & (hour < 18)).astype('int64')

    # Interaction flags: 1 only when both the season flag and daytime flag are 1.
    df['cyclone_daytime_interaction'] = df['is_cyclone_season'] * df['is_daytime']
    df['peak_cyclone_daytime_interaction'] = df['is_peak_cyclone_season'] * df['is_daytime']

    return df

In [None]:
# Add the cyclone-season / daytime indicators to the train and test frames.
train_df, test_df = [create_storm_features(split) for split in (train_df, test_df)]

In [None]:
# Prepare training data
# Columns fed to both models. The order is kept exactly as before.
feature_cols = [
    'hour_sin', 'hour_cos', 'day_of_week',
    'lat', 'lon', 'intensity', 'size', 'distance', "distance_travelled", "size_distance",
    'intensity_density', 'storm_proximity', "storm_speed_kmh", "distance_intensity",
    'is_peak_cyclone_season', 'is_cyclone_season', 'is_daytime', 'Trend', 'Seasonal', 'Residual',
    'cyclone_season_weight', 'cyclone_daytime_interaction', 'peak_cyclone_daytime_interaction',
    "Difference_size", "Difference_intensity_shift",
    'storm_severity', "rolling_intensity_mean_1h", "lagged_intensity", "lagged_size",
    "rolling_size_mean_1h", "Difference_intensity",
]

# Feature matrix and the two targets (1-hour and 3-hour horizons).
X = train_df[feature_cols]
y_1h = train_df['Storm_NosyBe_1h']
y_3h = train_df['Storm_NosyBe_3h']

# Split training and validation sets.
# A single call partitions X and both label vectors with the same shuffled
# indices, so the 3h labels are aligned with X_train / X_val by construction
# instead of relying on a second call repeating the same seed.
# NOTE(review): this is a random split, while several features are built from
# neighbouring rows (shift / diff / rolling). Validation rows may therefore
# share information with training rows -- consider a time-based split.
from sklearn.model_selection import train_test_split
X_train, X_val, y1h_train, y1h_val, y3h_train, y3h_val = train_test_split(
    X, y_1h, y_3h, test_size=0.2, random_state=42
)

In [None]:
from lightgbm import LGBMClassifier, early_stopping
from sklearn.metrics import roc_auc_score

# Both horizons use the same LightGBM configuration.
lgbm_params = dict(
    n_estimators=1000,
    learning_rate=0.003,
    num_leaves=29,
    random_state=21,
    verbose=-1,
)

# One classifier per prediction horizon.
model_1h = LGBMClassifier(**lgbm_params)
model_3h = LGBMClassifier(**lgbm_params)

# Fit each model on its own labels. The validation split is used as the
# evaluation set, and training stops early after 50 rounds without improvement.
model_1h.fit(
    X_train,
    y1h_train,
    eval_set=[(X_val, y1h_val)],
    callbacks=[early_stopping(stopping_rounds=50)],
)

model_3h.fit(
    X_train,
    y3h_train,
    eval_set=[(X_val, y3h_val)],
    callbacks=[early_stopping(stopping_rounds=50)],
)

Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.144161
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.173382


In [None]:
# Select the same feature columns, in the same order, that the models were fit on.
X_test = test_df.loc[:, feature_cols]

# Probability of the positive class (second column of predict_proba) per horizon.
pred_1h = model_1h.predict_proba(X_test)[:, 1]
pred_3h = model_3h.predict_proba(X_test)[:, 1]

# Build the submission table: identifier first, then one probability column
# per horizon.
# NOTE(review): assumes test_df has a 'storm_id' column; the train preview
# does not show one -- confirm against test.csv.
submission = test_df[['storm_id']].assign(
    Storm_NosyBe_1h=pred_1h,
    Storm_NosyBe_3h=pred_3h,
)

submission.to_csv('submission.csv', index=False)