In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn import feature_selection
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
import seaborn as sns

In [None]:
# Mount Google Drive under /content/drive so that files stored there can be
# opened with an ordinary filesystem path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the raw CSV on the mounted drive; kept in one named constant so
# the path is easy to spot and change.
DATA_PATH = '/content/drive/MyDrive/AI_Project/weatherAUS.csv'
dataset = pd.read_csv(DATA_PATH)

In [None]:
# Show the freshly loaded DataFrame as the cell output.
dataset


In [None]:
# Print the column summary (dtypes and non-null counts) of the raw data.
dataset.info()

Check for missing values

In [None]:
# True if at least one cell anywhere in the frame is missing.
dataset.isna().any(axis=None)

In [None]:
# Label-encode the string columns one at a time, using a fresh encoder for each.
# Missing entries are deliberately NOT handed to the encoder: they stay NaN so
# that the imputation cell further down still has something to fill in.
# (Fitting LabelEncoder on a column that contains NaN either turns "missing"
# into a category of its own or fails on the mixed str/float values, depending
# on the scikit-learn version.)
columns_to_encode = ['Date', 'Location', 'WindGustDir', 'WindDir9am',
                     'WindDir3pm', 'RainToday', 'RainTomorrow']
label_encoders = {}  # column name -> fitted encoder, usable for inverse_transform
for column in columns_to_encode:
    label_encoder = preprocessing.LabelEncoder()
    present = dataset[column].notna()
    codes = label_encoder.fit_transform(dataset.loc[present, column])
    if present.all():
        # Nothing missing: store the integer codes directly.
        dataset[column] = codes
    else:
        # Float column so that NaN can mark the rows that had no value.
        encoded = pd.Series(np.nan, index=dataset.index, dtype='float64')
        encoded.loc[present] = codes
        dataset[column] = encoded
    label_encoders[column] = label_encoder

In [None]:
# Show the DataFrame again to inspect the label-encoded columns.
dataset

Deal with missing values

In [None]:
from sklearn.impute import SimpleImputer

# Continuous measurements.
numerical_features = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
                      'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Temp9am', 'Cloud9am' ,'Cloud3pm', 'Temp3pm']

# Location and wind-direction columns (label-encoded in an earlier cell).
categorical_features = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']

# Two-valued columns (RainToday and RainTomorrow).
binary_features = ['RainToday', 'RainTomorrow']

# Each column group is filled with its own strategy: the column mean for the
# numerical group, the most frequent value for the other two groups.
imputation_plan = [
    (numerical_features, 'mean'),
    (categorical_features, 'most_frequent'),
    (binary_features, 'most_frequent'),
]
for columns, strategy in imputation_plan:
    imputer = SimpleImputer(strategy=strategy)
    dataset[columns] = imputer.fit_transform(dataset[columns])


In [None]:
# Print the column summary again to check the non-null counts after imputation.
dataset.info()

In [None]:
# Target is the 'RainTomorrow' column; every other column is a feature.
y = dataset['RainTomorrow']
X = dataset.drop(columns='RainTomorrow')

In [None]:
# Pairwise correlation matrix of the columns, shown as the cell output.
dataset.corr()

In [None]:
# Annotated heatmap of the pairwise column correlations on a large square figure.
corrmat = dataset.corr()
f, ax = plt.subplots(figsize=(15, 15))
heatmap_options = dict(cmap="Reds", linewidth=0.5, annot=True)
sns.heatmap(corrmat, ax=ax, **heatmap_options)

In [None]:
# Print the column summary (dtypes and non-null counts) once more.
dataset.info()

In [None]:
# info must be called: the bare attribute only echoes the bound-method repr
# instead of printing the column summary.
dataset.info()

In [None]:
# Split target and attributes.
# The columns are selected by name rather than by position: positional slicing
# (first ten columns as features, eleventh as target) depends on the CSV column
# order and ignores every later column, whereas the target this notebook sets
# out to predict is 'RainTomorrow'.
x = dataset.drop(columns='RainTomorrow')
y = dataset['RainTomorrow']

# Split into train and test sets (80% / 20%, fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0, test_size=0.2)
print(len(y_test))
print(len(X_train))
print(len(dataset))

# Preview only the first rows instead of printing the whole frame.
print(x.head())
print(y.head())

Normalization

In [None]:
# Learn the per-column min/max on the training split only, then apply the same
# mapping to both splits so the test data never influences the scaling.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Print the column summary of `dataset` (the scaled splits are separate arrays).
dataset.info()

Remove features with low variance

In [None]:
# Drop features whose variance does not exceed the threshold (default: 0).
# The selector is fitted on the training split only and then merely applied to
# the test split; re-fitting on the test data could keep a different set of
# columns and leak test information into the selection.
sel = feature_selection.VarianceThreshold()
train_variance_x = sel.fit_transform(X_train)
test_variance_x = sel.transform(X_test)
train_variance_x.shape

In [None]:
# Shape of the test split after the variance filter, shown as the cell output.
test_variance_x.shape

Univariate feature selection

In [None]:
# chi2 needs non-negative inputs, so the training features are min-max scaled
# before scoring.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)

# Keep the five features with the highest chi-squared score against the target.
bestfeatures = SelectKBest(score_func=chi2, k=5)
select = bestfeatures.fit(X_train_scaled, y_train)
X_train_selected = select.transform(X_train_scaled)

# Report the shape before and after selection.
print("X_train.shape: {}".format(X_train.shape))
print("X_train_selected.shape: {}".format(X_train_selected.shape))

Which features were selected

In [None]:
# Boolean mask over the input columns: True where the selector kept the feature.
mask = select.get_support(indices=False)
mask

Let's plot the features we selected and compare them to the features we ignored.

In [None]:
# Reversed grayscale colormap: True entries are drawn dark, False entries light.
cmap = 'gray_r'
# `mask` (from the previous cell) has one entry per input feature; draw it as a
# single row so every cell on the x axis corresponds to one feature.
plt.matshow(mask.reshape(1, -1), cmap=cmap)
plt.xlabel("Feature index")  # the axis enumerates features, not samples
plt.yticks(())
plt.show()


Recursive feature elimination

In [None]:
rfc = RandomForestClassifier(n_estimators=100, random_state=0)  # You can adjust the parameters as needed

# Feature extraction using RFE with the RandomForestClassifier.
# The number of features to keep must be smaller than the number of input
# columns, otherwise nothing is eliminated; 6 matches the later RFE section and
# is capped by the actual column count as a safeguard.
rfe = RFE(rfc, n_features_to_select=min(6, X_train.shape[1]))

# Fit on the train set
fit = rfe.fit(X_train, y_train)

# Transform the train set based on selected features
recursive_features = fit.transform(X_train)


Finally, to check whether feature selection has a positive impact on our model's accuracy, let's compare a model fitted on all the features with one fitted only on the selected ones.

In [None]:
# Reduce the test split to the columns chosen by the fitted selector.
X_test_selected = select.transform(X_test)

model = RandomForestClassifier(n_estimators=100, random_state=42)

# Baseline: train and score on every feature.
score_all = model.fit(X_train, y_train).score(X_test, y_test)
print("Score with all features: {:.3f}".format(score_all))

# Same estimator, refitted on the selected columns only.
score_selected = model.fit(X_train_selected, y_train).score(X_test_selected, y_test)
print("Score with selected features: {:.3f}".format(score_selected))

**Feature Importance**

Feature importance gives you a score for each feature of your data; the higher the score, the more important or relevant the feature is to your output variable.

Feature importance is available as a built-in attribute (`feature_importances_`) of tree-based classifiers.

In [None]:
# Fit an extra-trees ensemble and read its per-feature importance scores.
# A fixed random_state makes the importances (and the ranking discussed below)
# reproducible from one run to the next.
model = ExtraTreesClassifier(random_state=42)
model.fit(X_train,y_train)
model.feature_importances_

In [None]:
# Horizontal bar chart of the five largest importance scores; the bars are
# labelled with the positional index of the feature.
feat_importances = pd.Series(model.feature_importances_)
top_five = feat_importances.nlargest(5)
top_five.plot.barh()
plt.show()

We see that features 11, 9, and 3 have the highest importance among all the features. That matters because we now have a clear idea of which features to keep and which to eliminate.

**Model-Based Feature Selection**

When we use this meta-transformer, we have to specify which model to use. Here we use a Random Forest, together with a threshold value that determines which features are kept.

In [None]:
# Keep the features whose random-forest importance reaches the median importance.
forest_for_selection = RandomForestClassifier(n_estimators=100, random_state=50)
select = SelectFromModel(forest_for_selection, threshold='median')

# Fit the selector on the training data only, then reduce both splits with it.
X_train_selected = select.fit(X_train, y_train).transform(X_train)
X_test_selected = select.transform(X_test)

In [None]:
# Boolean mask with one entry per input feature (True = kept by the selector).
mask = select.get_support()

# Draw the mask as a single row; with the reversed grayscale map the kept
# features appear dark.
plt.matshow(mask.reshape(1,-1),cmap="gray_r")
plt.xlabel("Feature index")  # the axis enumerates features, not samples
plt.yticks(())
plt.show()

In [None]:
# Reduce the test split to the columns chosen by the fitted selector.
X_test_selected = select.transform(X_test)

model = RandomForestClassifier(n_estimators=100, random_state=42)

# Baseline: train and score on every feature.
score_all = model.fit(X_train, y_train).score(X_test, y_test)
print("Score with all features: {:.3f}".format(score_all))

# Same estimator, refitted on the selected columns only.
score_selected = model.fit(X_train_selected, y_train).score(X_test_selected, y_test)
print("Score with selected features: {:.3f}".format(score_selected))

Recursive Feature Elimination

In [None]:
from sklearn.feature_selection import RFE

# Recursively eliminate features, ranked by a random forest, until six remain.
rfe_estimator = RandomForestClassifier(n_estimators=100, random_state=42)
select = RFE(rfe_estimator, n_features_to_select=6)
X_train_selected = select.fit(X_train, y_train).transform(X_train)

# Report the shape before and after selection.
print("X_train.shape: {}".format(X_train.shape))
print("X_train_selected.shape: {}".format(X_train_selected.shape))

In [None]:
# Boolean mask with one entry per input feature (True = kept by the selector).
mask = select.get_support()
# Draw the mask as a single row; with the reversed grayscale map the kept
# features appear dark.
plt.matshow(mask.reshape(1,-1),cmap='gray_r')
plt.xlabel("Feature index")  # the axis enumerates features, not samples
plt.yticks(())
plt.show()

In [None]:
# Reduce the test split to the columns chosen by the fitted selector.
X_test_selected = select.transform(X_test)

# Build the classifier here instead of reusing whatever `model` an earlier cell
# left behind, so this cell gives the same result regardless of run order.
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Baseline: train and score on every feature.
model.fit(X_train,y_train)
print("Score with all features: {:.3f}".format(model.score(X_test,y_test)))

# Same estimator, refitted on the selected columns only.
model.fit(X_train_selected,y_train)
print("Score with selected features: {:.3f}".format(model.score(X_test_selected,y_test)))