In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
data = pd.read_csv("/kaggle/input/australia-weather-data/Weather Training Data.csv")  # Replace with the actual file path

# Data preprocessing
# One-hot encode categorical variables
data = pd.get_dummies(data, columns=['WindGustDir', 'WindDir9am', 'WindDir3pm'], drop_first=True)

# Encode target variable
le = LabelEncoder()
data['RainToday'] = le.fit_transform(data['RainToday'])
data['RainTomorrow'] = le.fit_transform(data['RainTomorrow'])

# Selecting features and target variable
X = data.drop(columns=['Location', 'RainTomorrow'])  # Drop columns not needed for prediction
y = data['RainTomorrow']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define individual models
mlp_clf = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
xgboost_clf = XGBClassifier(random_state=42)
ada_boost_clf = AdaBoostClassifier(random_state=42)
lgbm_clf = LGBMClassifier(random_state=42)

# Define the ensemble model using VotingClassifier
ensemble_model = VotingClassifier(estimators=[('mlp', mlp_clf),
                                              ('xgboost', xgboost_clf),
                                              ('ada_boost', ada_boost_clf),
                                              ('lgbm', lgbm_clf)],
                                  voting='hard')

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble model
y_pred = ensemble_model.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [None]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
data = pd.read_csv("/kaggle/input/australia-weather-data/Weather Training Data.csv")  # Replace with the actual file path

# Drop columns not needed for prediction
data = data.drop(columns=['Evaporation', 'Sunshine'])

# Remove rows with missing values
data = data.dropna()

# Encode categorical variables
le = LabelEncoder()
data['Location'] = le.fit_transform(data['Location'])
data['WindGustDir'] = le.fit_transform(data['WindGustDir'])
data['WindDir9am'] = le.fit_transform(data['WindDir9am'])
data['WindDir3pm'] = le.fit_transform(data['WindDir3pm'])
data['RainToday'] = le.fit_transform(data['RainToday'])

# Select features and target variable
X = data[['WindSpeed3pm', 'WindSpeed9am', 'WindGustDir', 'WindDir9am', 'WindDir3pm']]
y = data['RainToday']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predicting on the testing set
y_pred = model.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [None]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
data = pd.read_csv("/kaggle/input/australia-weather-data/Weather Training Data.csv")  # Replace with the actual file path

# Drop columns not needed for prediction
data = data.drop(columns=['Evaporation', 'Sunshine'])

# Remove rows with missing values
data = data.dropna()

# Encode categorical variables
le = LabelEncoder()
data['Location'] = le.fit_transform(data['Location'])
data['WindGustDir'] = le.fit_transform(data['WindGustDir'])
data['WindDir9am'] = le.fit_transform(data['WindDir9am'])
data['WindDir3pm'] = le.fit_transform(data['WindDir3pm'])
data['RainToday'] = le.fit_transform(data['RainToday'])

# Select features and target variable
X = data[['WindSpeed3pm', 'WindSpeed9am', 'WindGustDir', 'WindDir9am', 'WindDir3pm']]
y = data['RainToday']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Gaussian Naive Bayes": GaussianNB(),
    "MLP Classifier": MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42),
    "XGBoost": XGBClassifier(random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
    # Add more models as needed
}

# Train and evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy

# Display results
print("Model\t\t\tAccuracy")
print("-" * 30)
for name, accuracy in results.items():
    print(f"{name}\t\t{accuracy:.4f}")


In [None]:
w

In [None]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier

# Load the dataset
data = pd.read_csv("/kaggle/input/weatherdatabangladesh/Weather_Data.csv")  # Replace with the actual file path

# Drop columns not needed for prediction
data = data.drop(columns=['Evaporation', 'Sunshine', 'Date])

# Remove rows with missing values
data = data.dropna()

# Replace values in columns
replacement_dict = {
    'WindGustDir': {'W': 270, 'SE': 135, 'E': 90, 'WSW': 237.5, 'SW': 225, 'SSE': 157.5, 'NNE': 22.5},
    'WindDir9am': {'W': 270, 'SE': 135, 'E': 90, 'WSW': 237.5, 'SW': 225, 'SSE': 157.5, 'NNE': 22.5},
    'WindDir3pm': {'W': 270, 'SE': 135, 'E': 90, 'WSW': 237.5, 'SW': 225, 'SSE': 157.5, 'NNE': 22.5}
}

# Replace the values
for column, replacements in replacement_dict.items():
    data[column].replace(replacements, inplace=True)

# Encode categorical variables
le = LabelEncoder()
data['RainToday'] = le.fit_transform(data['RainToday'])

# Select features and target variable
X = data.drop(columns=['RainToday'])
y = data['RainToday']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define base models
base_models = [
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
    ("LGBM", LGBMClassifier(random_state=42)),
    ("MLP", MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)),
    ("XGBoost", XGBClassifier(random_state=42)),
    ("SVM", SVC(random_state=42)),
    ("Logistic Regression", LogisticRegression(random_state=42))
]

# Create the ensemble model
ensemble_model = VotingClassifier(estimators=base_models, voting='hard')

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble model
y_pred = ensemble_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier

# Load the dataset
data = pd.read_csv("/kaggle/input/weatherdatabangladesh/Weather_Data.csv")  # Replace with the actual file path

# Drop columns not needed for prediction
data = data.drop(columns=['Evaporation', 'Sunshine'])

# Replace values in columns
replacement_dict = {
    'WindGustDir': {'W': 270, 'SE': 135, 'E': 90, 'WSW': 237.5, 'SW': 225, 'SSE': 157.5},
    'WindDir9am': {'W': 270, 'SE': 135, 'E': 90, 'WSW': 237.5, 'SW': 225, 'SSE': 157.5},
    'WindDir3pm': {'W': 270, 'SE': 135, 'E': 90, 'WSW': 237.5, 'SW': 225, 'SSE': 157.5}
}

for column, replacements in replacement_dict.items():
    data[column].replace(replacements, inplace=True)

# Treat remaining missing values
for column in ['WindGustDir', 'WindDir9am', 'WindDir3pm']:
    data[column] = pd.to_numeric(data[column], errors='coerce')  # Convert to numeric
    data[column].fillna(data[column].mean(), inplace=True)  # Replace missing values with mean

# Encode categorical variables
le = LabelEncoder()
data['RainToday'] = le.fit_transform(data['RainToday'])

# Encode 'Location' column with one-hot encoding
data = pd.get_dummies(data, columns=['Location'], drop_first=True)

# Select features and target variable
X = data.drop(columns=['RainToday'])
y = data['RainToday']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define base models
base_models = [
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
    ("LGBM", LGBMClassifier(random_state=42)),
    ("MLP", MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)),
    ("XGBoost", XGBClassifier(random_state=42)),
    ("SVM", SVC(random_state=42)),
    ("Logistic Regression", LogisticRegression(random_state=42))
]

# Create the ensemble model
ensemble_model = VotingClassifier(estimators=base_models, voting='hard')

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble model
y_pred = ensemble_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier

# Load the dataset
data = pd.read_csv("/kaggle/input/australia-weather-data/Weather Training Data.csv")  # Replace with the actual file path

# Drop columns not needed for prediction
data = data.drop(columns=['Evaporation', 'Sunshine'])

# Replace values in columns
replacement_dict = {
    'WindGustDir': {'W': 270, 'SE': 135, 'E': 90, 'WSW': 237.5, 'SW': 225, 'SSE': 157.5},
    'WindDir9am': {'W': 270, 'SE': 135, 'E': 90, 'WSW': 237.5, 'SW': 225, 'SSE': 157.5},
    'WindDir3pm': {'W': 270, 'SE': 135, 'E': 90, 'WSW': 237.5, 'SW': 225, 'SSE': 157.5}
}

for column, replacements in replacement_dict.items():
    data[column].replace(replacements, inplace=True)

# Treat remaining missing values
for column in ['WindGustDir', 'WindDir9am', 'WindDir3pm']:
    data[column] = pd.to_numeric(data[column], errors='coerce')  # Convert to numeric
    data[column].fillna(data[column].mean(), inplace=True)  # Replace missing values with mean

# Encode categorical variables
le = LabelEncoder()
data['RainToday'] = le.fit_transform(data['RainToday'])

# Select features and target variable
X = data.drop(columns=['RainToday'])
y = data['RainToday']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define base models
base_models = [
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
    ("LGBM", LGBMClassifier(random_state=42)),
    ("MLP", MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)),
    ("XGBoost", XGBClassifier(random_state=42)),
    ("SVM", SVC(random_state=42)),
    ("Logistic Regression", LogisticRegression(random_state=42))
]

# Create the ensemble model
ensemble_model = VotingClassifier(estimators=base_models, voting='hard')

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble model
y_pred = ensemble_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier

# Load the dataset
data = pd.read_csv("/kaggle/input/weatherdatabangladesh/Weather_Data.csv")  # Replace with the actual file path

# Drop columns not needed for prediction
data = data.drop(columns=['Evaporation', 'Sunshine'])

# Replace values in columns
replacement_dict = {
    'WindGustDir': {'W': 270, 'SE': 135, 'E': 90, 'WSW': 237.5, 'SW': 225, 'SSE': 157.5},
    'WindDir9am': {'W': 270, 'SE': 135, 'E': 90, 'WSW': 237.5, 'SW': 225, 'SSE': 157.5},
    'WindDir3pm': {'W': 270, 'SE': 135, 'E': 90, 'WSW': 237.5, 'SW': 225, 'SSE': 157.5}
}

for column, replacements in replacement_dict.items():
    data[column].replace(replacements, inplace=True)

# Treat remaining missing values
for column in ['WindGustDir', 'WindDir9am', 'WindDir3pm']:
    data[column] = pd.to_numeric(data[column], errors='coerce')  # Convert to numeric
    data[column].fillna(data[column].mean(), inplace=True)  # Replace missing values with mean

# Encode categorical variables
le = LabelEncoder()
data['RainToday'] = le.fit_transform(data['RainToday'])

# Select features and target variable
X = data.drop(columns=['RainToday'])
y = data['RainToday']

# Check for NaN values
print("NaN values in dataset:", data.isna().sum())

# Drop rows with NaN values
data = data.dropna()

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define base models
base_models = [
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
    ("LGBM", LGBMClassifier(random_state=42)),
    ("MLP", MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)),
    ("XGBoost", XGBClassifier(random_state=42)),
    ("SVM", SVC(random_state=42)),
    ("Logistic Regression", LogisticRegression(random_state=42))
]

# Create the ensemble model
ensemble_model = VotingClassifier(estimators=base_models, voting='hard')

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble model
y_pred = ensemble_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier

# Load the dataset
data = pd.read_csv("/kaggle/input/australia-weather-data/Weather Training Data.csv")  # Replace with the actual file path

# Drop the 'Row_ID' column
data.drop(columns=['row ID'], inplace=True)

# Drop columns not needed for prediction
data = data.drop(columns=['Evaporation', 'Sunshine', 'Location'])

# Calculate mode for each column
mode_values = data.mode().iloc[0]

# Replace NA values with mode
data.fillna(mode_values, inplace=True)

# Convert categorical columns to numerical using LabelEncoder
le = LabelEncoder()
for column in ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']:
    data[column] = le.fit_transform(data[column])

# Handling categorical countries
# If the 'Country' column contains categorical values with 5 types, you can encode them using LabelEncoder
# if 'Country' in data.columns:
#     data['Country'] = le.fit_transform(data['Country'])

# Select features and target variable
X = data.drop(columns=['RainToday'])
y = data['RainToday']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define base models
base_models = [
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
    ("LGBM", LGBMClassifier(random_state=42)),
    ("MLP", MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)),
    ("XGBoost", XGBClassifier(random_state=42)),
    ("SVM", SVC(random_state=42)),
    ("Logistic Regression", LogisticRegression(random_state=42))
]

# Create the ensemble model
ensemble_model = VotingClassifier(estimators=base_models, voting='hard')

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble model
y_pred = ensemble_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


[LightGBM] [Info] Number of positive: 17548, number of negative: 62064
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045863 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2195
[LightGBM] [Info] Number of data points in the train set: 79612, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.220419 -> initscore=-1.263226
[LightGBM] [Info] Start training from score -1.263226


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.9967343247588425

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     15396
           1       1.00      0.99      0.99      4508

    accuracy                           1.00     19904
   macro avg       1.00      0.99      1.00     19904
weighted avg       1.00      1.00      1.00     19904



In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

# Load the training and test datasets
train_data = pd.read_csv("/kaggle/input/australia-weather-data/Weather Training Data.csv")
test_data = pd.read_csv("/kaggle/input/australia-weather-data/Weather Test Data.csv")

# Drop the 'row ID' column from the training data
train_data.drop(columns=['row ID'], inplace=True)

# Drop columns not needed for prediction
train_data = train_data.drop(columns=['Evaporation', 'Sunshine', 'Location'])

# Calculate mode for each column
mode_values = train_data.mode().iloc[0]

# Replace NA values with mode in both training and test datasets
train_data.fillna(mode_values, inplace=True)
test_data.fillna(mode_values, inplace=True)

# Convert categorical columns to numerical using LabelEncoder
le = LabelEncoder()
for column in ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']:
    train_data[column] = le.fit_transform(train_data[column])
    test_data[column] = le.transform(test_data[column])

# Select features and target variable in training data
X_train = train_data.drop(columns=['RainToday'])
y_train = train_data['RainToday']

# Select features and target variable in test data
X_test = test_data.drop(columns=['RainToday'])
y_test_ground_truth = test_data['RainToday']  # Ground truth labels for the test data

# Define base models
base_models = [
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
    ("LGBM", LGBMClassifier(random_state=42)),
    ("MLP", MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)),
    ("XGBoost", XGBClassifier(random_state=42)),
    ("SVM", SVC(random_state=42)),
    ("Logistic Regression", LogisticRegression(random_state=42))
]

# Create the ensemble model
ensemble_model = VotingClassifier(estimators=base_models, voting='hard')

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Predict on the test data
y_pred = ensemble_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test_ground_truth, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test_ground_truth, y_pred))


[LightGBM] [Info] Number of positive: 22056, number of negative: 77460
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018857 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2200
[LightGBM] [Info] Number of data points in the train set: 99516, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221633 -> initscore=-1.256177
[LightGBM] [Info] Start training from score -1.256177


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Evaporation
- Location
- Sunshine
- row ID
Feature names seen at fit time, yet now missing:
- RainTomorrow


In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

# Load the training and test datasets
train_data = pd.read_csv("/kaggle/input/australia-weather-data/Weather Training Data.csv")
test_data = pd.read_csv("/kaggle/input/australia-weather-data/Weather Test Data.csv")

# Drop the 'row ID' column from the training data
train_data.drop(columns=['row ID'], inplace=True)

# Drop columns not needed for prediction
train_data = train_data.drop(columns=['Evaporation', 'Sunshine', 'Location'])

# Calculate mode for each column in the training data
mode_values = train_data.mode().iloc[0]

# Replace NA values with mode in both training and test datasets
train_data.fillna(mode_values, inplace=True)
test_data.fillna(mode_values, inplace=True)

# Convert categorical columns to numerical using LabelEncoder
le = LabelEncoder()
for column in ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']:
    train_data[column] = le.fit_transform(train_data[column])
    test_data[column] = le.transform(test_data[column])

# Select features and target variable in training data
X_train = train_data.drop(columns=['RainToday'])
y_train = train_data['RainToday']

# Select features and target variable in test data
X_test = test_data.drop(columns=['RainToday'])
y_test_ground_truth = test_data['RainToday']  # Ground truth labels for the test data

# Define base models
base_models = [
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
    ("LGBM", LGBMClassifier(random_state=42)),
    ("MLP", MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)),
    ("XGBoost", XGBClassifier(random_state=42)),
    ("SVM", SVC(random_state=42)),
    ("Logistic Regression", LogisticRegression(random_state=42))
]

# Create the ensemble model
ensemble_model = VotingClassifier(estimators=base_models, voting='hard')

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Predict on the test data
y_pred = ensemble_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test_ground_truth, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test_ground_truth, y_pred))


[LightGBM] [Info] Number of positive: 22056, number of negative: 77460
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023846 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2200
[LightGBM] [Info] Number of data points in the train set: 99516, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221633 -> initscore=-1.256177
[LightGBM] [Info] Start training from score -1.256177


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Evaporation
- Location
- Sunshine
- row ID
Feature names seen at fit time, yet now missing:
- RainTomorrow


In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer

# Load the training and test datasets
train_data = pd.read_csv("/kaggle/input/australia-weather-data/Weather Training Data.csv")
test_data = pd.read_csv("/kaggle/input/australia-weather-data/Weather Test Data.csv")

# Drop unnecessary columns
drop_columns = ['Evaporation', 'Sunshine', 'Location', 'row ID']
train_data.drop(columns=drop_columns, inplace=True)
test_data.drop(columns=drop_columns, inplace=True)

# Handle missing values
imputer = SimpleImputer(strategy='most_frequent')
train_data = pd.DataFrame(imputer.fit_transform(train_data), columns=train_data.columns)
test_data = pd.DataFrame(imputer.transform(test_data), columns=test_data.columns)

# Convert categorical columns to numerical using LabelEncoder
le = LabelEncoder()
for column in ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']:
    train_data[column] = le.fit_transform(train_data[column])
    test_data[column] = le.transform(test_data[column])

# Select features and target variable in training data
X_train = train_data.drop(columns=['RainToday'])
y_train = train_data['RainToday']

# Select features and target variable in test data
X_test = test_data.drop(columns=['RainToday'])
y_test_ground_truth = test_data['RainToday']  # Ground truth labels for the test data

# Scale numerical features
scaler = StandardScaler()
numerical_columns = ['MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
                     'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
                     'Temp9am', 'Temp3pm']
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

# Define base models
base_models = [
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
    ("LGBM", LGBMClassifier(random_state=42)),
    ("MLP", MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)),
    ("XGBoost", XGBClassifier(random_state=42)),
    ("SVM", SVC(random_state=42)),
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42))
]

# Create the ensemble model
ensemble_model = VotingClassifier(estimators=base_models, voting='hard')

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Predict on the test data
y_pred = ensemble_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test_ground_truth, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test_ground_truth, y_pred))


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- RainTomorrow


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier

# Load the dataset
data = pd.read_csv("/kaggle/input/australia-weather-data/Weather Test Data.csv")  # Replace with the actual file path

# Drop the 'Row_ID' column
data.drop(columns=['row ID'], inplace=True)

# Drop columns not needed for prediction
data = data.drop(columns=['Evaporation', 'Sunshine', 'Location'])

# Calculate mode for each column
mode_values = data.mode().iloc[0]

# Replace NA values with mode
data.fillna(mode_values, inplace=True)

# Convert categorical columns to numerical using LabelEncoder
le = LabelEncoder()
for column in ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']:
    data[column] = le.fit_transform(data[column])

# Handling categorical countries
# If the 'Country' column contains categorical values with 5 types, you can encode them using LabelEncoder
# if 'Country' in data.columns:
#     data['Country'] = le.fit_transform(data['Country'])

# Select features and target variable
X = data.drop(columns=['RainToday'])
y = data['RainToday']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define base models
base_models = [
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
    ("LGBM", LGBMClassifier(random_state=42)),
    ("MLP", MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)),
    ("XGBoost", XGBClassifier(random_state=42)),
    ("SVM", SVC(random_state=42)),
    ("Logistic Regression", LogisticRegression(random_state=42))
]

# Create the ensemble model
ensemble_model = VotingClassifier(estimators=base_models, voting='hard')

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble model
y_pred = ensemble_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


[LightGBM] [Info] Number of positive: 7458, number of negative: 26683
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019250 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2179
[LightGBM] [Info] Number of data points in the train set: 34141, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.218447 -> initscore=-1.274739
[LightGBM] [Info] Start training from score -1.274739


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 100.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6595
           1       1.00      1.00      1.00      1941

    accuracy                           1.00      8536
   macro avg       1.00      1.00      1.00      8536
weighted avg       1.00      1.00      1.00      8536



In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier

# Load the dataset
data = pd.read_csv("/kaggle/input/australia-weather-data/Weather Training Data.csv")  # Replace with the actual file path

# Trim the dataset to 10,000 rows
data = data.head(70000)

# Drop the 'row ID' column
data.drop(columns=['row ID'], inplace=True)

# Drop columns not needed for prediction
data = data.drop(columns=['Evaporation', 'Sunshine', 'Location'])

# Calculate mode for each column
mode_values = data.mode().iloc[0]

# Replace NA values with mode
data.fillna(mode_values, inplace=True)

# Convert categorical columns to numerical using LabelEncoder
le = LabelEncoder()
for column in ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']:
    data[column] = le.fit_transform(data[column])

# Select features and target variable
X = data.drop(columns=['RainToday'])
y = data['RainToday']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define base models
base_models = [
#     ("AdaBoost", AdaBoostClassifier(random_state=42)),
#     ("LGBM", LGBMClassifier(random_state=42)),
#     ("MLP", MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)),
#     ("XGBoost", XGBClassifier(random_state=42)),
#     ("SVM", SVC(random_state=42)),
    ("Logistic Regression", LogisticRegression(random_state=42))
]

# Create the ensemble model
ensemble_model = VotingClassifier(estimators=base_models, voting='hard')

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble model
y_pred = ensemble_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.9857142857142858

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     10876
           1       0.98      0.96      0.97      3124

    accuracy                           0.99     14000
   macro avg       0.98      0.98      0.98     14000
weighted avg       0.99      0.99      0.99     14000



In [14]:
import pandas as pd

# Load the dataset
data = pd.read_csv("/kaggle/input/australia-weather-data/Weather Training Data.csv")  # Replace with the actual file path

# Trim the dataset to 10,000 rows
data = data.head(10000)

# Drop the 'row ID' column
data.drop(columns=['row ID'], inplace=True)

# Drop columns not needed for prediction
data = data.drop(columns=['Evaporation', 'Sunshine', 'Location'])

# Calculate mode for each column
mode_values = data.mode().iloc[0]

# Replace NA values with mode
data.fillna(mode_values, inplace=True)

# Convert categorical columns to numerical using LabelEncoder
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for column in ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']:
    data[column] = le.fit_transform(data[column])

# Select features and target variable
X = data.drop(columns=['RainToday'])
y = data['RainToday']

# Implement the Find-S algorithm
def find_s_algorithm(X, y):
    hypothesis = None
    for i in range(len(X)):
        if y[i] == 1:
            if hypothesis is None:
                hypothesis = X.iloc[i]
            else:
                for j in range(len(X.columns)):
                    if hypothesis[j] != X.iloc[i][j]:
                        hypothesis[j] = '?'
    return hypothesis

# Apply the Find-S algorithm
hypothesis = find_s_algorithm(X, y)

# Predict using the hypothesis
y_pred = []
for i in range(len(X)):
    y_pred.append(1 if all(hypothesis[j] == X.iloc[i][j] or hypothesis[j] == '?' for j in range(len(X.columns))) else 0)

# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y, y_pred))


  if hypothesis[j] != X.iloc[i][j]:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hypothesis[j] = '?'
  hypothesis[j] = '?'
  hypothesis[j] = '?'
  y_pred.append(1 if all(hypothesis[j] == X.iloc[i][j] or hypothesis[j] == '?' for j in range(len(X.columns))) else 0)


Accuracy: 0.1936

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00      8064
           1       0.19      1.00      0.32      1936

    accuracy                           0.19     10000
   macro avg       0.10      0.50      0.16     10000
weighted avg       0.04      0.19      0.06     10000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# Load the dataset
data = pd.read_csv("/kaggle/input/australia-weather-data/Weather Training Data.csv")  # Replace with the actual file path

# Trim the dataset to 10,000 rows
data = data.head(10000)

# Drop the 'row ID' column
data.drop(columns=['row ID'], inplace=True)

# Drop columns not needed for prediction
data = data.drop(columns=['Evaporation', 'Sunshine', 'Location'])

# Calculate mode for each column
mode_values = data.mode().iloc[0]

# Replace NA values with mode
data.fillna(mode_values, inplace=True)

# Convert categorical columns to numerical using LabelEncoder
le = LabelEncoder()
for column in ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']:
    data[column] = le.fit_transform(data[column])

# Select features and target variable
X = data.drop(columns=['RainToday'])
y = data['RainToday']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define base models
base_models = [
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("KNN", KNeighborsClassifier()),
    ("Naive Bayes", GaussianNB()),
    ("Random Forest", RandomForestClassifier(random_state=42))
]

# Evaluate each model
for name, model in base_models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy:", accuracy)
    print(f"\n{name} Classification Report:\n", classification_report(y_test, y_pred))


Decision Tree Accuracy: 1.0

Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1595
           1       1.00      1.00      1.00       405

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000

KNN Accuracy: 0.8905

KNN Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.97      0.93      1595
           1       0.83      0.57      0.68       405

    accuracy                           0.89      2000
   macro avg       0.87      0.77      0.81      2000
weighted avg       0.89      0.89      0.88      2000

Naive Bayes Accuracy: 0.9565

Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97      1595
           1       0.82      1.00      0.90       405


In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Load the dataset
data = pd.read_csv("/kaggle/input/australia-weather-data/Weather Training Data.csv")  # Replace with the actual file path

# Trim the dataset to 10,000 rows
# data = data.head(90000)

# Drop the 'row ID' column
data.drop(columns=['row ID'], inplace=True)

# Drop columns not needed for prediction
data = data.drop(columns=['Evaporation', 'Sunshine', 'Location'])

# Calculate mode for each column
mode_values = data.mode().iloc[0]

# Replace NA values with mode
data.fillna(mode_values, inplace=True)

# Convert categorical columns to numerical using LabelEncoder
le = LabelEncoder()
for column in ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']:
    data[column] = le.fit_transform(data[column])

# Select features and target variable
X = data.drop(columns=['RainToday'])
y = data['RainToday']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define base models
base_models = [
    ("Logistic Regression", LogisticRegression(random_state=42)),
    ("KNN", KNeighborsClassifier()),
    ("Decision Tree", DecisionTreeClassifier(random_state=42))
]

# Evaluate each model
for name, model in base_models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy:", accuracy)
    print(f"\n{name} Classification Report:\n", classification_report(y_test, y_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.9838888888888889

Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     13825
           1       0.98      0.95      0.96      4175

    accuracy                           0.98     18000
   macro avg       0.98      0.97      0.98     18000
weighted avg       0.98      0.98      0.98     18000

KNN Accuracy: 0.8832777777777778

KNN Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.97      0.93     13825
           1       0.84      0.61      0.71      4175

    accuracy                           0.88     18000
   macro avg       0.87      0.79      0.82     18000
weighted avg       0.88      0.88      0.88     18000

Decision Tree Accuracy: 1.0

Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     13825
           1  

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression

# Load the dataset
data = pd.read_csv("/kaggle/input/australia-weather-data/Weather Training Data.csv")  # Replace with the actual file path

# Trim the dataset to 10,000 rows
# data = data.head(90000)

# Drop the 'row ID' column
data.drop(columns=['row ID'], inplace=True)

# Drop columns not needed for prediction
data = data.drop(columns=['Evaporation', 'Sunshine', 'Location'])

# Calculate mode for each column
mode_values = data.mode().iloc[0]

# Replace NA values with mode
data.fillna(mode_values, inplace=True)

# Convert categorical columns to numerical using LabelEncoder
le = LabelEncoder()
for column in ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']:
    data[column] = le.fit_transform(data[column])

# Select features and target variable
X = data.drop(columns=['RainToday'])
y = data['RainToday']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define base models
base_models = [
    ("Logistic Regression", LogisticRegression(random_state=42)),
    ("KNN", KNeighborsClassifier()),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Linear Regression", LinearRegression())
]

# Evaluate each model
for name, model in base_models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Convert regression predictions to binary classes for comparison
    y_pred_binary = [1 if pred >= 0.5 else 0 for pred in y_pred]
    accuracy = accuracy_score(y_test, y_pred_binary)
    print(f"{name} Accuracy:", accuracy)
    print(f"\n{name} Classification Report:\n", classification_report(y_test, y_pred_binary))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.9891981511254019

Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     15396
           1       0.98      0.97      0.98      4508

    accuracy                           0.99     19904
   macro avg       0.99      0.98      0.98     19904
weighted avg       0.99      0.99      0.99     19904

KNN Accuracy: 0.8869573954983923

KNN Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.97      0.93     15396
           1       0.84      0.62      0.71      4508

    accuracy                           0.89     19904
   macro avg       0.87      0.79      0.82     19904
weighted avg       0.88      0.89      0.88     19904

Decision Tree Accuracy: 1.0

Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     15396
           1  

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Load the dataset
data = pd.read_csv("/kaggle/input/australia-weather-data/Weather Training Data.csv")  # Replace with the actual file path

# Trim the dataset to 10,000 rows
# data = data.head(90000)

# Drop the 'row ID' column
data.drop(columns=['row ID'], inplace=True)

# Drop columns not needed for prediction
data = data.drop(columns=['Evaporation', 'Sunshine', 'Location'])

# Calculate mode for each column
mode_values = data.mode().iloc[0]

# Replace NA values with mode
data.fillna(mode_values, inplace=True)

# Convert categorical columns to numerical using LabelEncoder
le = LabelEncoder()
for column in ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']:
    data[column] = le.fit_transform(data[column])

# Select features and target variable
X = data.drop(columns=['RainToday'])
y = data['RainToday']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define base models
base_models = [
    ("Logistic Regression", LogisticRegression(random_state=42)),
    ("KNN", KNeighborsClassifier()),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Naive Bayes", GaussianNB()),
    ("SVM", SVC(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42))
]

# Evaluate each model
for name, model in base_models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy:", accuracy)
    print(f"\n{name} Classification Report:\n", classification_report(y_test, y_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.9891981511254019

Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     15396
           1       0.98      0.97      0.98      4508

    accuracy                           0.99     19904
   macro avg       0.99      0.98      0.98     19904
weighted avg       0.99      0.99      0.99     19904

KNN Accuracy: 0.8869573954983923

KNN Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.97      0.93     15396
           1       0.84      0.62      0.71      4508

    accuracy                           0.89     19904
   macro avg       0.87      0.79      0.82     19904
weighted avg       0.88      0.89      0.88     19904

Decision Tree Accuracy: 1.0

Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     15396
           1  