In [2]:
import pandas as pd
import numpy as np

# Read the forest-fire records from disk (point the path at your own copy of 'forestfires.csv')
df = pd.read_csv(filepath_or_buffer='forestfires.csv')

# Preview the leading rows to get a feel for the columns
print(df.head(n=5))


   X  Y month  day  FFMC   DMC     DC  ISI  temp  RH  wind  rain  area
0  7  5   mar  fri  86.2  26.2   94.3  5.1   8.2  51   6.7   0.0   0.0
1  7  4   oct  tue  90.6  35.4  669.1  6.7  18.0  33   0.9   0.0   0.0
2  7  4   oct  sat  90.6  43.7  686.9  6.7  14.6  33   1.3   0.0   0.0
3  8  6   mar  fri  91.7  33.3   77.5  9.0   8.3  97   4.0   0.2   0.0
4  8  6   mar  sun  89.3  51.3  102.2  9.6  11.4  99   1.8   0.0   0.0


In [3]:
# Create classes for 'area' (region affected)
def classify_area(area, partial_max=5):
    """Label a single burned-area value with an affected-region class.

    Parameters
    ----------
    area : float
        Burned area of one record.
    partial_max : float, default 5
        Largest area that is still labelled 'PartiallyAffected'.

    Returns
    -------
    str or None
        'NotAffected' when ``area`` is exactly 0, 'PartiallyAffected' when
        ``0 < area <= partial_max``, 'MostlyAffected' when
        ``area > partial_max``, and None when ``area`` is missing.

    Raises
    ------
    ValueError
        If ``area`` is negative.
    """
    # A missing value must stay unclassified; a bare `else` would have
    # labelled it 'MostlyAffected' because every comparison with NaN is False.
    if pd.isna(area):
        return None
    # Negative areas are invalid input; fail loudly instead of mislabelling.
    if area < 0:
        raise ValueError(f"area must be non-negative, got {area!r}")
    if area == 0:
        return 'NotAffected'
    if area <= partial_max:
        return 'PartiallyAffected'
    return 'MostlyAffected'

# Label every record by passing its 'area' value through classify_area
df['AffectedClass'] = df['area'].map(classify_area)

In [8]:
# Split the frame into one subset per affected class
not_affected = df.loc[df['AffectedClass'].eq('NotAffected')]
partially_affected = df.loc[df['AffectedClass'].eq('PartiallyAffected')]
mostly_affected = df.loc[df['AffectedClass'].eq('MostlyAffected')]
mostly_affected

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,AffectedClass
179,8,6,aug,tue,88.8,147.3,614.5,9.0,14.4,66,5.4,0.0,5.23,MostlyAffected
180,1,3,sep,sun,92.4,124.1,680.7,8.5,23.9,32,6.7,0.0,5.33,MostlyAffected
181,8,6,oct,mon,84.9,32.8,664.2,3.0,19.1,32,4.0,0.0,5.44,MostlyAffected
182,5,4,feb,sun,86.8,15.6,48.3,3.9,12.4,53,2.2,0.0,6.38,MostlyAffected
183,7,4,oct,mon,91.7,48.5,696.1,11.1,16.8,45,4.5,0.0,6.83,MostlyAffected
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504,4,3,aug,wed,94.5,139.4,689.1,20.0,28.9,29,4.9,0.0,49.59,MostlyAffected
505,1,2,aug,thu,91.0,163.2,744.4,10.1,26.7,35,1.8,0.0,5.80,MostlyAffected
512,4,3,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44,MostlyAffected
513,2,4,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29,MostlyAffected


In [9]:
# b. Merge two subsets: stack the NotAffected rows on top of the PartiallyAffected rows
merged_df = pd.concat(objs=[not_affected, partially_affected], axis='index')
merged_df

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,AffectedClass
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00,NotAffected
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00,NotAffected
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00,NotAffected
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00,NotAffected
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00,NotAffected
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486,5,4,aug,tue,95.1,141.3,605.8,17.7,24.1,43,6.3,0.0,2.00,PartiallyAffected
496,4,5,aug,mon,96.2,175.5,661.8,16.8,32.6,26,3.1,0.0,2.77,PartiallyAffected
503,2,4,aug,wed,94.5,139.4,689.1,20.0,29.2,30,4.9,0.0,1.95,PartiallyAffected
509,5,4,aug,fri,91.0,166.9,752.6,7.1,21.1,71,7.6,1.4,2.17,PartiallyAffected


In [10]:
# c. Sort Data using Temperature, Wind, RH, and Area
# Coerce each sort key to a numeric dtype first; unparseable entries become NaN
for sort_key in ('temp', 'wind', 'RH', 'area'):
    df[sort_key] = pd.to_numeric(df[sort_key], errors='coerce')


In [11]:
# Order rows by temp, then wind, then RH (all ascending);
# remaining ties are broken by area, largest first
sorted_df = df.sort_values(
    ['temp', 'wind', 'RH', 'area'],
    ascending=[True, True, True, False],
)
sorted_df

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,AffectedClass
280,4,6,dec,fri,84.7,26.7,352.6,4.1,2.2,59,4.9,0.0,9.27,MostlyAffected
282,6,3,feb,sun,84.9,27.5,353.5,3.4,4.2,51,4.0,0.0,0.00,NotAffected
465,2,2,feb,sat,79.5,3.6,15.3,1.8,4.6,59,0.9,0.0,6.84,MostlyAffected
463,6,5,feb,tue,75.1,4.4,16.2,1.9,4.6,82,6.3,0.0,5.39,MostlyAffected
278,4,4,dec,mon,85.4,25.4,349.7,2.6,4.6,21,8.5,0.0,22.03,MostlyAffected
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,1,3,aug,fri,95.9,158.0,633.6,11.3,32.4,27,2.2,0.0,0.00,NotAffected
491,4,4,aug,thu,95.8,152.0,624.1,13.8,32.4,21,4.5,0.0,0.00,NotAffected
496,4,5,aug,mon,96.2,175.5,661.8,16.8,32.6,26,3.1,0.0,2.77,PartiallyAffected
484,2,5,aug,sun,94.9,130.3,587.1,14.1,33.1,25,4.0,0.0,26.43,MostlyAffected


In [12]:
# d. Transposing Data: flip the first 5 sorted rows so that columns become rows
transposed_df = sorted_df.head(5).T
transposed_df

Unnamed: 0,280,282,465,463,278
X,4,6,2,6,4
Y,6,3,2,5,4
month,dec,feb,feb,feb,dec
day,fri,sun,sat,tue,mon
FFMC,84.7,84.9,79.5,75.1,85.4
DMC,26.7,27.5,3.6,4.4,25.4
DC,352.6,353.5,15.3,16.2,349.7
ISI,4.1,3.4,1.8,1.9,2.6
temp,2.2,4.2,4.6,4.6,4.6
RH,59,51,59,82,21


In [13]:
# e. Melting Data to long format: one (month, Variable, Value) row per measurement
melted_df = df.melt(
    id_vars=['month'],  # kept as an identifier on every output row
    value_vars=['temp', 'wind', 'RH', 'area'],  # measurements to unpivot
    var_name='Variable',  # column holding the original column name
    value_name='Value',  # column holding the measurement itself
)
melted_df

Unnamed: 0,month,Variable,Value
0,mar,temp,8.20
1,oct,temp,18.00
2,oct,temp,14.60
3,mar,temp,8.30
4,mar,temp,11.40
...,...,...,...
2063,aug,area,6.44
2064,aug,area,54.29
2065,aug,area,11.16
2066,aug,area,0.00


In [14]:
# f. Casting data to wide format: one row per month, one column per variable
casted_df = pd.pivot_table(
    melted_df,
    values='Value',
    index='month',
    columns='Variable',
    aggfunc='mean',  # average of each variable within a month
)
# Turn the 'month' index back into an ordinary column
casted_df = casted_df.reset_index()

casted_df

Variable,month,RH,area,temp,wind
0,apr,46.888889,8.891111,12.044444,4.666667
1,aug,45.48913,12.489076,21.631522,4.086413
2,dec,38.444444,13.33,4.522222,7.644444
3,feb,55.7,6.275,9.635,3.755
4,jan,89.0,0.0,5.25,2.0
5,jul,45.125,14.369687,22.109375,3.734375
6,jun,45.117647,5.841176,20.494118,4.135294
7,mar,40.0,4.356667,13.083333,4.968519
8,may,67.0,19.24,14.65,4.45
9,nov,31.0,0.0,11.8,4.5


In [15]:
# Print a labelled preview of every result produced by the steps above
for heading, preview in (
    ("\n--- Not Affected Subset ---\n", not_affected.head()),
    ("\n--- Partially Affected Subset ---\n", partially_affected.head()),
    ("\n--- Merged Data (NotAffected + PartiallyAffected) ---\n", merged_df.head()),
    ("\n--- Sorted Data (Temperature, Wind, RH, Area) ---\n", sorted_df[['temp', 'wind', 'RH', 'area']].head()),
    ("\n--- Transposed Data (First 5 rows) ---\n", transposed_df),
    ("\n--- Melted Data (Long Format) ---\n", melted_df.head()),
    ("\n--- Casted Data (Wide Format) ---\n", casted_df),
):
    print(heading, preview)


--- Not Affected Subset ---
    X  Y month  day  FFMC   DMC     DC  ISI  temp  RH  wind  rain  area  \
0  7  5   mar  fri  86.2  26.2   94.3  5.1   8.2  51   6.7   0.0   0.0   
1  7  4   oct  tue  90.6  35.4  669.1  6.7  18.0  33   0.9   0.0   0.0   
2  7  4   oct  sat  90.6  43.7  686.9  6.7  14.6  33   1.3   0.0   0.0   
3  8  6   mar  fri  91.7  33.3   77.5  9.0   8.3  97   4.0   0.2   0.0   
4  8  6   mar  sun  89.3  51.3  102.2  9.6  11.4  99   1.8   0.0   0.0   

  AffectedClass  
0   NotAffected  
1   NotAffected  
2   NotAffected  
3   NotAffected  
4   NotAffected  

--- Partially Affected Subset ---
      X  Y month  day  FFMC    DMC     DC   ISI  temp  RH  wind  rain  area  \
138  9  9   jul  tue  85.8   48.3  313.4   3.9  18.0  42   2.7   0.0  0.36   
139  1  4   sep  tue  91.0  129.5  692.6   7.0  21.7  38   2.2   0.0  0.43   
140  2  5   sep  mon  90.9  126.5  686.5   7.0  21.9  39   1.8   0.0  0.47   
141  1  2   aug  wed  95.5   99.9  513.3  13.2  23.3  31   4.5   0.0 