<a href="https://colab.research.google.com/github/santhimaddipudi/ML_Handson/blob/main/EDA_for_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing, model_selection

In [43]:
# For CSV
data = pd.read_csv("/content/sample_data/forbes.csv")


In [44]:
data.head()

Unnamed: 0,Rank,Company,Country,Sales,Profits,Assets,Market Value,Sector,Industry
0,1,ICBC,China,151.4,42.0,3473.2,229.8,Financials,Major Banks
1,2,China Construction Bank,China,134.2,35.0,3016.6,200.5,Financials,Regional Banks
2,3,Berkshire Hathaway,United States,222.9,24.1,620.9,409.9,Financials,Investment Services
3,4,JPMorgan Chase,United States,102.5,24.2,2513.0,306.6,Financials,Major Banks
4,5,Wells Fargo,United States,97.6,21.9,1943.4,274.4,Financials,Major Banks


In [23]:
data.tail()

Unnamed: 0,Rank,Company,Country,Sales,Profits,Assets,Market Value,Sector,Industry
1995,1996,BEKB-BCBE,Switzerland,0.555,0.131,27.9,1.7,Financials,Regional Banks
1996,1997,Fastighets Balder,Sweden,0.63,0.639,10.2,3.8,Materials,
1997,1998,Akamai Technologies,United States,2.3,0.316,4.4,10.1,Information Technology,Computer Services
1998,1999,Oita Bank,Japan,0.523,0.071,27.9,0.595,Financials,Regional Banks
1999,2000,Tech Mahindra,India,4.2,0.469,3.6,6.7,Information Technology,


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   tickers  5 non-null      object
 1   eps      5 non-null      object
 2   revenue  5 non-null      int64 
 3   price    5 non-null      object
 4   people   5 non-null      object
dtypes: int64(1), object(4)
memory usage: 328.0+ bytes


In [24]:
data.shape  # Returns (number_of_rows, number_of_columns)


(2000, 9)

In [25]:
data.describe()

Unnamed: 0,Rank,Sales,Profits,Assets,Market Value
count,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1000.5,17.66451,1.240713,84.533546,24.417844
std,577.494589,29.320116,2.918742,260.984489,44.7614
min,1.0,0.001,-13.0,0.001,0.072
25%,500.75,4.0,0.318,10.875,6.6
50%,1000.5,8.8,0.613,22.9,11.95
75%,1500.25,17.425,1.3,52.4,24.4
max,2000.0,485.3,45.2,3473.2,752.0


##Duplication and Missing Values Management

In [27]:
# Get duplicates
duplicates = data.duplicated()
duplicates


0       False
1       False
2       False
3       False
4       False
        ...  
1995    False
1996    False
1997    False
1998    False
1999    False
Length: 2000, dtype: bool

In [45]:
#Remove Duplicates
data = data.drop_duplicates()

In [46]:
missing_values = data.isnull().sum()
missing_values

Rank              0
Company           0
Country           0
Sales             0
Profits           0
Assets            0
Market Value      0
Sector          197
Industry        491
dtype: int64

In [47]:
# Filling missing values with the mean
data['Profits'] = data['Profits'].fillna(data['Profits'].mean())


In [48]:
# Dropping rows with missing values
data = data.dropna()

In [37]:
data.head()

Unnamed: 0,Rank,Company,Country,Sales,Profits,Assets,Market Value,Sector,Industry
0,1,ICBC,China,151.4,42.0,3473.2,229.8,Financials,Major Banks
1,2,China Construction Bank,China,134.2,35.0,3016.6,200.5,Financials,Regional Banks
2,3,Berkshire Hathaway,United States,222.9,24.1,620.9,409.9,Financials,Investment Services
3,4,JPMorgan Chase,United States,102.5,24.2,2513.0,306.6,Financials,Major Banks
4,5,Wells Fargo,United States,97.6,21.9,1943.4,274.4,Financials,Major Banks


##Data Reduction

In [49]:
#DROP irrelevant columns
data = data.drop(['irrelevant_column1', 'irrelevant_column2'], axis=1)
#Remove columns with constant values as they don’t add any information.
data = data.loc[:, data.apply(pd.Series.nunique) != 1]

KeyError: "['irrelevant_column1', 'irrelevant_column2'] not found in axis"

##Feature Engineering:Feature Engineering is the process of using domain knowledge to create new features from the raw data, which can significantly improve the performance of machine learning models.

In [50]:
# Example: Creating a feature for the age of a car from the 'Year' column
#data['Car_Age'] = current_year - data['Year']

# Example: Binning ages into groups
data['Profits_group'] = pd.cut(data['Profits'], bins=[0, 30,60,100], labels=['low', 'min',  'high'])

# Example: Extracting a brand name from a product description
data['Industry_catgory'] = data['Industry'].apply(lambda x: x.split()[0])

In [51]:
data

Unnamed: 0,Rank,Company,Country,Sales,Profits,Assets,Market Value,Sector,Industry,Profits_group,Industry_catgory
0,1,ICBC,China,151.400,42.000,3473.2,229.800,Financials,Major Banks,min,Major
1,2,China Construction Bank,China,134.200,35.000,3016.6,200.500,Financials,Regional Banks,min,Regional
2,3,Berkshire Hathaway,United States,222.900,24.100,620.9,409.900,Financials,Investment Services,low,Investment
3,4,JPMorgan Chase,United States,102.500,24.200,2513.0,306.600,Financials,Major Banks,low,Major
4,5,Wells Fargo,United States,97.600,21.900,1943.4,274.400,Financials,Major Banks,low,Major
...,...,...,...,...,...,...,...,...,...,...,...
1993,1994,Shaanxi Coal Industry,China,3.800,-0.024,13.6,9.000,Materials,Diversified Metals & Mining,,Diversified
1994,1995,Aurubis,Germany,10.600,0.249,4.5,3.100,Materials,Diversified Metals & Mining,low,Diversified
1995,1996,BEKB-BCBE,Switzerland,0.555,0.131,27.9,1.700,Financials,Regional Banks,low,Regional
1997,1998,Akamai Technologies,United States,2.300,0.316,4.4,10.100,Information Technology,Computer Services,low,Computer


##Creating Features