The following data is available in both datasets:

    - Elevation — Elevation in meters
    - Aspect — Aspect in degrees azimuth
    - Slope — Slope in degrees
    - Horizontal_Distance_To_Hydrology — Horizontal Distance to nearest surface water features
    - Vertical_Distance_To_Hydrology — Vertical Distance to nearest surface water features
    - Horizontal_Distance_To_Roadways — Horizontal Distance to nearest roadway
    - Hillshade_9am — Hillshade index at 9am, summer solstice
    - Hillshade_Noon — Hillshade index at noon, summer solstice
    - Hillshade_3pm — Hillshade index at 3pm, summer solstice
    - Horizontal_Distance_To_Fire_Points — Horizontal Distance to nearest wildfire ignition points
    - Wilderness_Area — Wilderness area designation
    - Soil_Type — Soil Type designation
    - Cover_Type — Forest Cover Type designation

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.simplefilter('ignore')

In [None]:
# Read the forest cover type CSV (path relative to the working directory) into a DataFrame.
trees = pd.read_csv("../input/forest-cover-type-dataset/covtype.csv")

In [None]:
# Dimensions of the loaded frame as (rows, columns).
trees.shape

In [None]:
# Preview the first rows of the data.
trees.head()

In [None]:
# Print the column names, dtypes and non-null counts.
trees.info()

### Check for Anomalies & Outliers

In [None]:
def outlier_function(df, col_name):
    ''' Count extreme outliers in one column of a dataframe using a conservative 3 * IQR fence.

    The first and third quartiles of ``df[col_name]`` are computed, and the limits are placed
    three interquartile ranges below the first quartile and above the third quartile.
    A value is an outlier when it lies strictly below the lower limit or strictly above
    the upper limit.

    df: dataframe holding the column
    col_name: label of the numeric column to inspect

    returns the lower limit, the upper limit and the number of outliers, in that order
    '''
    # convert the column once; the quartiles and the count below all reuse this array
    values = np.asarray(df[col_name])

    first_quartile, third_quartile = np.percentile(values, [25, 75])
    IQR = third_quartile - first_quartile

    upper_limit = third_quartile + (3 * IQR)
    lower_limit = first_quartile - (3 * IQR)

    # vectorized comparison instead of a Python-level loop over every value
    is_outlier = (values < lower_limit) | (values > upper_limit)
    outlier_count = int(np.count_nonzero(is_outlier))
    return lower_limit, upper_limit, outlier_count

In [None]:
# loop through all columns and report the ones that contain outliers
for column in trees.columns:
    # evaluate each column once and reuse the count for both the check and the message
    outlier_count = outlier_function(trees, column)[2]
    if outlier_count > 0:
        print("There are {} outliers in {}".format(outlier_count, column))

In [None]:
# keep only the rows whose fire-point distance lies strictly inside the outlier limits;
# the limits are computed once, before any row is removed
fire_lower, fire_upper = outlier_function(trees, 'Horizontal_Distance_To_Fire_Points')[:2]
fire_distance = trees['Horizontal_Distance_To_Fire_Points']

# .copy() makes the result an independent frame, so the columns added to `trees` later on
# are not written into a slice of the unfiltered data
trees = trees[(fire_distance > fire_lower) & (fire_distance < fire_upper)].copy()
trees.shape

### 2. Exploratory Data Analysis

In [None]:
# collect every wilderness-area and soil-type column, then list the distinct values they hold
is_binary_columns = [
    name
    for name in trees.columns
    if "Wilderness" in name or "Soil" in name
]
pd.unique(trees[is_binary_columns].to_numpy().ravel())

In [None]:
# row-wise total of the four wilderness area columns
wilderness_flags = trees[["Wilderness_Area1", "Wilderness_Area2", "Wilderness_Area3", "Wilderness_Area4"]]
# skipna=False keeps the semantics of plain `+`: a missing value makes the total missing
trees["w_sum"] = wilderness_flags.sum(axis=1, skipna=False)
print(trees["w_sum"].value_counts())

In [None]:
# names of all soil type columns (collected before "soil_sum" itself is added)
soil_columns = [name for name in trees.columns if "Soil" in name]

# row-wise total of the soil type columns; skipna=False mirrors repeated `+=`
trees["soil_sum"] = trees[soil_columns].sum(axis=1, skipna=False)

print(trees["soil_sum"].value_counts())

In [None]:
# Number of rows per cover type label.
trees.Cover_Type.value_counts()

In [None]:
# bar chart of the number of rows per cover type; the column is passed explicitly as the
# x variable because a positional argument is not reliably interpreted as x by seaborn
sns.countplot(x=trees.Cover_Type)
plt.show()

In [None]:

# for every row, the name of the wilderness area column that is set to 1;
# the columns are selected by name so the result does not depend on column positions
wilderness_columns = ["Wilderness_Area1", "Wilderness_Area2", "Wilderness_Area3", "Wilderness_Area4"]
trees['Wilderness_Area_Type'] = (trees[wilderness_columns] == 1).idxmax(axis=1)

# sorted list of the wilderness areas that actually occur in the data
wilderness_areas = sorted(trees['Wilderness_Area_Type'].value_counts().index.tolist())

# plot the cover type distribution for each wilderness area on shared axes
for area in wilderness_areas:
    subset = trees[trees['Wilderness_Area_Type'] == area]
    sns.kdeplot(subset["Cover_Type"], label=area, linewidth=2)

# set title, legend and labels
plt.ylabel("Density")
plt.xlabel("Cover_Type")
plt.title("Density of Cover Types Among Different Wilderness Areas", size=14)
# the curves carry labels, but they are only shown once a legend is drawn
plt.legend()

### Reverse One Hot Encoding

In [None]:
# Column labels of the frame, including the helper columns added above.
trees.columns

In [None]:
def split_numbers_chars(row):
    '''Split a string into its leading part and the run of digits at its end.

    Returns a (head, tail) pair: tail holds the trailing digit characters
    (empty when the string does not end in a digit) and head is everything before them.
    '''
    # position where the trailing digits begin
    cut = len(row.rstrip('0123456789'))
    return row[:cut], row[cut:]

def reverse_one_hot_encode(dataframe, start_loc, end_loc, numeric_column_name):
    ''' Collapse a set of one-hot-encoded columns into a single numeric column, in place.

    dataframe: frame that holds the one-hot columns; it receives the new column
    start_loc, end_loc: positional bounds of the one-hot columns (end_loc is exclusive)
    numeric_column_name: name of the column to create

    1) finds, per row, the name of the first column in the range whose value equals 1
    2) splits that name into its alphabetical part and its trailing digits
    3) stores the trailing digits as an int64 column named numeric_column_name

    The intermediate results are kept in local Series, so no scratch columns are written to
    (or dropped from) the caller's dataframe. Nothing is returned.
    '''
    active_column = (dataframe.iloc[:, start_loc:end_loc] == 1).idxmax(axis=1)
    # element [1] of the split is the digit suffix of the column name
    digit_suffix = active_column.apply(lambda name: split_numbers_chars(name)[1])
    dataframe[numeric_column_name] = digit_suffix.astype('int64')

In [None]:
# reverse_one_hot_encode(trees, 12, 15, "Wilderness_Area")

In [None]:
# locate the one-hot soil columns by name (they contain "Soil" and end in a digit) rather
# than by hard-coded positions, so no wilderness-area or label column can fall inside the slice
# NOTE(review): assumes the soil columns sit next to each other in the frame
soil_indicator_columns = [c for c in trees.columns if "Soil" in c and c[-1:].isdigit()]
soil_start = trees.columns.get_loc(soil_indicator_columns[0])
soil_end = trees.columns.get_loc(soil_indicator_columns[-1]) + 1
reverse_one_hot_encode(trees, soil_start, soil_end, "Soil_Type")

In [None]:
# store the continuous variables in a list; they are named explicitly so the selection
# does not depend on whether the file starts with an id column
continuous_variables = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]

In [None]:
# model inputs followed by the label: continuous features, wilderness indicators, soil type, cover type
all_features_w_label = [*continuous_variables, *wilderness_areas, "Soil_Type", "Cover_Type"]
trees_cleaned = trees.loc[:, all_features_w_label]

In [None]:
# Preview the first rows of the reduced frame.
trees_cleaned.head()

In [None]:
# annotated heatmap of the pairwise correlations between all selected columns
plt.figure(figsize=(15, 12))
correlations = trees_cleaned.corr()
sns.heatmap(correlations, annot=True)

In [None]:
# absolute correlations, then each column's correlation with the label, strongest first
corr_matrix = trees_cleaned.corr().abs()
label_correlation = corr_matrix["Cover_Type"]
print(label_correlation.sort_values(ascending=False))

## Split The Data For Model

In [None]:
# label column on its own, every other column as features
y = trees_cleaned["Cover_Type"]
X = trees_cleaned.drop(columns=["Cover_Type"])

In [None]:
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for validation; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_valid, y_train, y_valid = split_parts

In [None]:
# report the shape of each piece of the split
for label, part in (
    ('Training Data Shape:', X_train),
    ('Validation Data Shape:', X_valid),
    ('Training Label Shape:', y_train),
    ('Validation Label Shape:', y_valid),
):
    print(label, part.shape)

In [None]:
from sklearn.preprocessing import StandardScaler

# learn the scaling statistics from the training set only
scaler = StandardScaler()
scaler.fit(X_train, y_train)

# apply the same fitted scaling to the training and validation sets
X_train_scaled = scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# function to train a given model, generate predictions, and return accuracy score
def fit_evaluate_model(model, X_train, y_train, X_valid, Y_valid):
    '''Train a model, predict on the validation set and return the accuracy score.

    model: estimator exposing fit(X, y) and predict(X)
    X_train, y_train: training features and labels passed to model.fit
    X_valid: validation features passed to model.predict
    Y_valid: true validation labels the predictions are scored against

    returns the value of accuracy_score(Y_valid, predictions)
    '''
    model.fit(X_train, y_train)
    y_predicted = model.predict(X_valid)
    # score against the labels passed in, not against a same-sounding global variable
    return accuracy_score(Y_valid, y_predicted)

In [None]:
from lightgbm import LGBMClassifier

# LightGBM classifier with default settings, trained and scored on the
# unscaled training / validation features
lgbm_classifier = LGBMClassifier()
lgbm_accuracy = fit_evaluate_model(
    model=lgbm_classifier,
    X_train=X_train,
    y_train=y_train,
    X_valid=X_valid,
    Y_valid=y_valid,
)
print("Number of correct predictions made out of all predictions are:", lgbm_accuracy)

In [None]:
# Display the training labels.
y_train

In [None]:
# Feature names, in the order the model was trained on.
X_train.columns

In [None]:
def predict(Elev, Aspe, Slope, H_Hydrology, V_Hydrology, Roadways, hill9, hillnoon, hill3, firept, area, soil):
    '''Predict the cover type of a single observation with the trained LightGBM model.

    Elev ... firept: the ten continuous measurements, written to the first ten feature
        positions of X in the order they are given
    area: name of the wilderness area column of X to switch on (e.g. 'Wilderness_Area3')
    soil: numeric soil type, stored in the "Soil_Type" column of X

    returns the predicted cover type label
    raises IndexError when `area` or "Soil_Type" is not a column of X
    '''
    def column_position(name):
        # position of a feature column in X, looked up by name
        matches = np.where(X.columns == name)[0]
        if matches.size == 0:
            raise IndexError("'{}' is not a feature column of X".format(name))
        return matches[0]

    x = np.zeros(len(X.columns))
    x[:10] = [Elev, Aspe, Slope, H_Hydrology, V_Hydrology, Roadways, hill9, hillnoon, hill3, firept]

    # the soil type lives in its own column after the wilderness indicators, so it is
    # addressed by name instead of being written to position 10
    x[column_position("Soil_Type")] = soil
    x[column_position(area)] = 1

    # the classifier was fitted on the unscaled feature frame, so the observation is passed
    # unscaled as well, with the same column labels
    observation = pd.DataFrame([x], columns=X.columns)

    return lgbm_classifier.predict(observation)[0]
   

In [None]:
result = predict(2861,159,20,124,11,2967,228,242,146,2588,'Wilderness_Area3',40)

# message for each cover type label; the labels run from 1 to 7, so Krummholz is 7
cover_type_messages = {
    1: 'Cover By Spruce/Fir Trees',
    2: 'Cover By Lodgepole Pine Trees',
    3: 'Cover By Ponderosa Pine Trees',
    4: 'Cover By Cottonwood/Willow Trees',
    5: 'Cover By Aspen Trees',
    6: 'Cover By Douglas-fir Trees',
    7: 'Cover By KrummholzTrees',
}

# an unknown label prints nothing
if result in cover_type_messages:
    print(cover_type_messages[result])