In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [2]:
%matplotlib inline
warnings.filterwarnings('ignore')
sns.set_style("darkgrid")

In [3]:
import statsmodels.api as sm
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

In [4]:
bikes = pd.read_csv("uci_edu_day.csv" , sep=",")

# **Identification the outliers**

Z-score method: 

The z-score measures how many standard deviations a data point is from the mean. By calculating the z-score for each data point, you can identify any points that are more than a certain number of standard deviations away from the mean (e.g. z-score greater than 3 or less than -3) and consider them to be outliers.

In [5]:
bikes_zscore = bikes[['temp' ,'hum', 'atemp']]
bikes_zscore.head()

Unnamed: 0,temp,hum,atemp
0,0.344167,0.805833,0.363625
1,0.363478,0.696087,0.353739
2,0.196364,0.437273,0.189405
3,0.2,0.590435,0.212122
4,0.226957,0.436957,0.22927


In [6]:
# Generate some example data
#bikes_zscore = pd.DataFrame({'temp': np.random.normal(0, 1, 100), # Generates 100 random values from a normal distribution with mean 0 and standard deviation 1, and assigns them to the 'temp' column
#                     'atemp': np.random.normal(10, 2, 100), # Generates 100 random values from a normal distribution with mean 10 and standard deviation 2, and assigns them to the 'atemp' column
#                     'hum': np.random.normal(-5, 5, 100)}) # Generates 100 random values from a normal distribution with mean -5 and standard deviation 5, and assigns them to the 'hum' column


                    



#dteday	season	yr	mnth	holiday	weekday	workingday	weathersit	***temp	atemp	hum**	windspeed	casual	registered	cnt

# Define a function to calculate the z-score for each data point
def zscore(series):
    return (series - series.mean()) / series.std()

# Apply the zscore function to each column of the data frame
z_scores = bikes_zscore.apply(zscore)

# Identify any data points with a z-score greater than 3 or less than -3
outliers = bikes_zscore[(z_scores > 4).any(axis=1) | (z_scores < -4).any(axis=1)]

# Print the outliers
print(outliers)




        temp  hum     atemp
68  0.389091  0.0  0.385668


In [7]:
bikes

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.200000,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.229270,0.436957,0.186900,82,1518,1600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,727,2012-12-27,1,1,12,0,4,1,2,0.254167,0.226642,0.652917,0.350133,247,1867,2114
727,728,2012-12-28,1,1,12,0,5,1,2,0.253333,0.255046,0.590000,0.155471,644,2451,3095
728,729,2012-12-29,1,1,12,0,6,0,2,0.253333,0.242400,0.752917,0.124383,159,1182,1341
729,730,2012-12-30,1,1,12,0,0,0,1,0.255833,0.231700,0.483333,0.350754,364,1432,1796


In [8]:
#In this code, we first generate some example data in the form of a Pandas DataFrame. We then define a function zscore that takes a Pandas Series (i.e. a single column of the DataFrame) as input and returns the z-score for each data point in that column. We apply this function to each column of the DataFrame using the apply method.

#The code you provided generates a Pandas DataFrame called bikes with three columns, each containing 100 randomly generated values from a normal distribution with different mean and standard deviation parameters.

#Next, we identify any data points with a z-score greater than 3 or less than -3 using the any method to check whether any values in the DataFrame are outside of the specified range. We use the | operator to combine the results for each column, so that a row will be considered an outlier if any of its columns have a z-score outside of the specified range. Finally, we print the outliers, which are any rows of the original DataFrame that have at least one outlier in any of their columns.

# ... The choice of z-score range to use when identifying outliers using the z-score method is somewhat subjective and may depend on the specific characteristics of your data and your analysis goals. A common convention is to consider any data point with a z-score greater than 3 or less than -3 to be an outlier, as these values correspond to data points that are more than 3 standard deviations away from the mean in either direction.
#However, the choice of z-score range should be based on your understanding of the distribution of your data and the potential sources of variability. For example, if your data has a very wide distribution or contains extreme values, you may want to use a larger z-score range to identify outliers. Conversely, if your data has a very narrow distribution or contains few extreme values, you may want to use a smaller z-score range to avoid identifying too many false positives as outliers.
#Ultimately, it's important to carefully consider the potential impact of outliers on your analysis and to use a range that makes sense for your specific research question and data characteristics.

#### To remove or handle the outliers

In [9]:
# Remove outliers from the DataFrame
bikes_no_outliers = z_scores.drop(outliers.index)

In [10]:
bikes_no_outliers

Unnamed: 0,temp,hum,atemp
0,-0.826097,1.249316,-0.679481
1,-0.720601,0.478785,-0.740146
2,-1.633538,-1.338358,-1.748570
3,-1.613675,-0.263001,-1.609168
4,-1.466410,-1.340576,-1.503941
...,...,...,...
726,-1.317763,0.175687,-1.520067
727,-1.322319,-0.266056,-1.345768
728,-1.322319,0.877791,-1.423370
729,-1.308661,-1.014969,-1.489029


#### The example code snippet that demonstrates how to identify outliers using the IQR method:

python
Copy code
.# Generate a sample DataFrame
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randn(100, 4), columns=list('ABCD'))

.# Calculate the first and third quartiles
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)

.# Calculate the IQR
IQR = Q3 - Q1

...# Define outliers as any value below Q1 - 1.5*IQR or above Q3 + 1.5*IQR
outliers = df[(df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))].dropna()

#### ...It has worked?

The Interquartile range (IQR) method:, you can use the quantile method of Pandas DataFrame to compute the first and third quartiles, and then calculate the IQR as the difference between them.

Once you have computed the IQR, you can define outliers as any data point that falls below the first quartile minus 1.5 times the IQR, or above the third quartile plus 1.5 times the IQR.



In this code, we first generate a sample DataFrame df with 100 rows and 4 columns of random data. We then calculate the first and third quartiles using the quantile method of the DataFrame. We use a parameter of 0.25 and 0.75 to specify the first and third quartiles, respectively.

We then calculate the IQR as the difference between the third and first quartiles. Finally, we define outliers as any data point that falls below Q1 - 1.5IQR or above Q3 + 1.5IQR. The resulting outliers are stored in a new DataFrame called outliers.

**Note:
 ...that this method assumes that the data is normally distributed. If the data is skewed or has a different distribution, other methods such as the z-score method or visual inspection may be more appropriate for identifying outliers.