## Dataset Loading and Pre-Processing

In [16]:
import pandas as pd
data = pd.read_csv("/Users/sougataa/Documents/Learners_space/Machine_learning/week2/weather_classification_data.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13200 entries, 0 to 13199
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Temperature           13200 non-null  float64
 1   Humidity              13200 non-null  int64  
 2   Wind Speed            13200 non-null  float64
 3   Precipitation (%)     13200 non-null  float64
 4   Cloud Cover           13200 non-null  object 
 5   Atmospheric Pressure  13200 non-null  float64
 6   UV Index              13200 non-null  int64  
 7   Season                13200 non-null  object 
 8   Visibility (km)       13200 non-null  float64
 9   Location              13200 non-null  object 
 10  Weather Type          13200 non-null  object 
dtypes: float64(5), int64(2), object(4)
memory usage: 1.1+ MB


## Data Shuffling

In [17]:
from sklearn.utils import shuffle
data = shuffle(data)
data.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
1109,41.0,53,3.5,15.0,clear,1027.23,6,Winter,9.5,coastal,Sunny
7697,35.0,27,8.0,11.0,clear,1010.02,11,Spring,8.0,inland,Sunny
4554,31.0,55,14.0,40.0,partly cloudy,1009.3,4,Summer,6.5,inland,Cloudy
1672,19.0,74,7.5,92.0,overcast,993.95,0,Spring,2.0,mountain,Rainy
1706,34.0,57,6.0,39.0,overcast,1007.32,3,Spring,6.5,coastal,Cloudy


## Scaling and Labeling

In [18]:
# START CODE HERE
object_columns = ['Cloud Cover', 'Season', 'Location', 'Weather Type']
non_object_columns = ['Temperature', 'Humidity', 'Wind Speed', 'Precipitation (%)', 'Atmospheric Pressure', 'UV Index', 'Visibility (km)']
# END CODE HERE
print(f"Object Columns: {object_columns}\nNon Object Columns: {non_object_columns}")

Object Columns: ['Cloud Cover', 'Season', 'Location', 'Weather Type']
Non Object Columns: ['Temperature', 'Humidity', 'Wind Speed', 'Precipitation (%)', 'Atmospheric Pressure', 'UV Index', 'Visibility (km)']


#### TEST

In [19]:
# DON'T EDIT THIS CELL
if (object_columns == ['Cloud Cover', 'Season', 'Location', 'Weather Type'] and non_object_columns == ['Temperature', 'Humidity', 'Wind Speed', 'Precipitation (%)', 'Atmospheric Pressure', 'UV Index', 'Visibility (km)']):
  print(f"\033[32mTest Passed\033[0m")
else:
  print(f"\033[31mTest Failed\033[0m")

[32mTest Passed[0m


#### CODE

In [20]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# START CODE HERE

column_transformer = ColumnTransformer(
    transformers=[
        ('cat', OrdinalEncoder(), object_columns),
        ('num', StandardScaler(), non_object_columns)
    ]
)

# Fit and transform the data
data_scaled_labeled = column_transformer.fit_transform(data)
# END CODE HERE

all_columns = object_columns + non_object_columns

# Create the new DataFrame
data_scaled_labeled = pd.DataFrame(data_scaled_labeled, columns=all_columns)

#### TEST

In [21]:
# DON'T EDIT THIS CELL

data_scaled_labeled_check = pd.read_csv("/Users/sougataa/Documents/Learners_space/Machine_learning/week2/data_scaled_labeled_check.csv")
import numpy as np
data_scaled_labeled_values = np.sort(data_scaled_labeled.values,axis=0)
data_scaled_labeled_check_values = np.sort(data_scaled_labeled_check.values,axis=0)
if np.allclose(data_scaled_labeled_values, data_scaled_labeled_check_values, equal_nan=True):
  print(f"\033[32mTest Passed\033[0m")
else:
  print(f"\033[31mTest Failed\033[0m")

[32mTest Passed[0m


## Data Splitting

In [22]:
X = data_scaled_labeled.drop('Weather Type', axis=1)
y = data_scaled_labeled['Weather Type']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=10)

#### TEST

In [23]:
# DON'T EDIT THIS CELL

if (len(X_train)==11880 and len(X_test)==1320):
  print(f"\033[32mTest Passed\033[0m")
else:
  print(f"\033[31mTest Failed\033[0m")

if (len(y_train)==11880 and len(y_test)==1320):
  print(f"\033[32mTest Passed\033[0m")
else:
  print(f"\033[31mTest Failed\033[0m")

[32mTest Passed[0m
[32mTest Passed[0m


In [24]:
X_train.head()

Unnamed: 0,Cloud Cover,Season,Location,Temperature,Humidity,Wind Speed,Precipitation (%),Atmospheric Pressure,UV Index,Visibility (km)
7017,2.0,3.0,2.0,-1.100192,0.658092,1.109918,1.200664,-0.509915,-1.038715,-0.137308
12278,0.0,3.0,0.0,-0.294931,0.55905,-0.192836,0.950236,0.547933,0.257813,1.494076
2859,2.0,2.0,1.0,0.050181,0.261924,-0.265211,1.200664,0.154634,-0.520104,-0.582231
5495,3.0,2.0,1.0,0.280255,-0.233285,0.386166,1.044147,-0.095916,-0.520104,-0.730539
8469,3.0,0.0,2.0,0.625367,1.499948,1.327043,0.918933,-0.112046,1.813646,0.60423


## Training and Testing

In [25]:
# START CODE HERE

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
score = model.score(X_test, y_test)

# END CODE HERE

print("Accuracy:", score*100)

Accuracy: 91.5909090909091


#### TEST

In [26]:
# DON'T EDIT THIS CELL

if (score>0.90):
  print(f"\033[32mTest Passed\033[0m")
else:
  print(f"\033[31mTest Failed\033[0m")

[32mTest Passed[0m


## Predicting Output (Here, Weather Type) 

In [27]:
input = X_test.head()
input

Unnamed: 0,Cloud Cover,Season,Location,Temperature,Humidity,Wind Speed,Precipitation (%),Atmospheric Pressure,UV Index,Visibility (km)
11923,2.0,3.0,1.0,-0.927636,-0.431369,-0.627087,0.950236,-0.199953,-1.038715,-1.323769
2331,3.0,3.0,1.0,1.027998,-0.728494,-0.844213,-1.178401,0.364053,1.813646,0.752538
217,2.0,3.0,2.0,0.567848,1.10378,-0.409962,0.292863,0.379107,-0.520104,-0.730539
1760,3.0,0.0,1.0,-0.409968,0.01432,-0.265211,-0.614938,0.138773,-0.001493,0.455922
3963,0.0,3.0,1.0,1.258072,-0.134243,-0.337587,-1.272311,0.281791,1.295035,1.19746


In [30]:
y = model.predict(input)
y.to_list()

AttributeError: 'numpy.ndarray' object has no attribute 'to_list'