In [1]:
import pandas as pd

In [4]:
df = pd.read_csv('iris.csv')
print(df)

     sepal.length  sepal.width  petal.length  petal.width    variety
0             5.1          3.5           1.4          0.2     Setosa
1             4.9          3.0           1.4          0.2     Setosa
2             4.7          3.2           1.3          0.2     Setosa
3             4.6          3.1           1.5          0.2     Setosa
4             5.0          3.6           1.4          0.2     Setosa
..            ...          ...           ...          ...        ...
145           6.7          3.0           5.2          2.3  Virginica
146           6.3          2.5           5.0          1.9  Virginica
147           6.5          3.0           5.2          2.0  Virginica
148           6.2          3.4           5.4          2.3  Virginica
149           5.9          3.0           5.1          1.8  Virginica

[150 rows x 5 columns]


In [30]:
# CLEANING

# Display first few rows to understand the structure of the dataset
print("Initial Dataset:\n", df.head())

# 1. Check for Missing Values
print("\nMissing Values:\n", df.isnull().sum())

# 2. Remove Rows with Missing Values (if any)
df = df.dropna()

# 3. Check for Negative Values (to remove erroneous data in length/width columns)
negative_values = df[(df['sepal.length'] < 0) |
                     (df['sepal.width'] < 0) |
                     (df['petal.length'] < 0) |
                     (df['petal.width'] < 0)]

if not negative_values.empty:
    print("\nNegative values detected:\n", negative_values)
    # Remove rows with negative values
    df = df[~df.isin(negative_values)].dropna()

# 4. Remove Duplicates
df = df.drop_duplicates()

# 5. Verify Data Types
print("\nData Types:\n", df.dtypes)

# After cleaning, let's display the cleaned dataset
print("\nCleaned Dataset:\n", df.head())

# Now you can continue with the operations as per the earlier instructions


Initial Dataset:
    sepal.length  sepal.width  petal.length  petal.width variety
0           5.1          3.5           1.4          0.2  Setosa
1           4.9          3.0           1.4          0.2  Setosa
2           4.7          3.2           1.3          0.2  Setosa
3           4.6          3.1           1.5          0.2  Setosa
4           5.0          3.6           1.4          0.2  Setosa

Missing Values:
 sepal.length    0
sepal.width     0
petal.length    0
petal.width     0
variety         0
dtype: int64

Data Types:
 sepal.length    float64
sepal.width     float64
petal.length    float64
petal.width     float64
variety          object
dtype: object

Cleaned Dataset:
    sepal.length  sepal.width  petal.length  petal.width variety
0           5.1          3.5           1.4          0.2  Setosa
1           4.9          3.0           1.4          0.2  Setosa
2           4.7          3.2           1.3          0.2  Setosa
3           4.6          3.1           1.5          0.

In [31]:
# SUBSET
setosa_df = df[df['variety']=='Setosa']
versicolor_df = df[df['variety']=='Versicolor']
viriginica_df = df[df['variety']=='Virginica']
print(setosa_df.head())
print(versicolor_df.head())
print(viriginica_df.head())

   sepal.length  sepal.width  petal.length  petal.width variety
0           5.1          3.5           1.4          0.2  Setosa
1           4.9          3.0           1.4          0.2  Setosa
2           4.7          3.2           1.3          0.2  Setosa
3           4.6          3.1           1.5          0.2  Setosa
4           5.0          3.6           1.4          0.2  Setosa
    sepal.length  sepal.width  petal.length  petal.width     variety
50           7.0          3.2           4.7          1.4  Versicolor
51           6.4          3.2           4.5          1.5  Versicolor
52           6.9          3.1           4.9          1.5  Versicolor
53           5.5          2.3           4.0          1.3  Versicolor
54           6.5          2.8           4.6          1.5  Versicolor
     sepal.length  sepal.width  petal.length  petal.width    variety
100           6.3          3.3           6.0          2.5  Virginica
101           5.8          2.7           5.1          1.9  Virgi

In [32]:
# MERGE
setosa_df.loc[:,'id'] = range(len(setosa_df))
versicolor_df.loc[:,'id'] = range(len(versicolor_df))
merged_df = pd.merge(setosa_df,versicolor_df,on='id',suffixes=('_setosa','_versicolor'))
print(merged_df.head())

   sepal.length_setosa  sepal.width_setosa  petal.length_setosa  \
0                  5.1                 3.5                  1.4   
1                  4.9                 3.0                  1.4   
2                  4.7                 3.2                  1.3   
3                  4.6                 3.1                  1.5   
4                  5.0                 3.6                  1.4   

   petal.width_setosa variety_setosa  id  sepal.length_versicolor  \
0                 0.2         Setosa   0                      7.0   
1                 0.2         Setosa   1                      6.4   
2                 0.2         Setosa   2                      6.9   
3                 0.2         Setosa   3                      5.5   
4                 0.2         Setosa   4                      6.5   

   sepal.width_versicolor  petal.length_versicolor  petal.width_versicolor  \
0                     3.2                      4.7                     1.4   
1                     3.2 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  setosa_df.loc[:,'id'] = range(len(setosa_df))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  versicolor_df.loc[:,'id'] = range(len(versicolor_df))


In [33]:
# SORT
sorted_df = df.sort_values(by='petal.length',ascending=False)
print(sorted_df)

     sepal.length  sepal.width  petal.length  petal.width    variety
118           7.7          2.6           6.9          2.3  Virginica
117           7.7          3.8           6.7          2.2  Virginica
122           7.7          2.8           6.7          2.0  Virginica
105           7.6          3.0           6.6          2.1  Virginica
131           7.9          3.8           6.4          2.0  Virginica
..            ...          ...           ...          ...        ...
40            5.0          3.5           1.3          0.3     Setosa
35            5.0          3.2           1.2          0.2     Setosa
14            5.8          4.0           1.2          0.2     Setosa
13            4.3          3.0           1.1          0.1     Setosa
22            4.6          3.6           1.0          0.2     Setosa

[149 rows x 5 columns]


In [34]:
# TRANSPOSE
transposed_df = df.head(3).transpose()
print(transposed_df)

                   0       1       2
sepal.length     5.1     4.9     4.7
sepal.width      3.5     3.0     3.2
petal.length     1.4     1.4     1.3
petal.width      0.2     0.2     0.2
variety       Setosa  Setosa  Setosa


In [46]:
# MELT (Wide -> Long)
melted_df = pd.melt(df, id_vars=['variety'], value_vars=['sepal.length', 'sepal.width', 'petal.length', 'petal.width'])
print(melted_df)


       variety      variable  value
0       Setosa  sepal.length    5.1
1       Setosa  sepal.length    4.9
2       Setosa  sepal.length    4.7
3       Setosa  sepal.length    4.6
4       Setosa  sepal.length    5.0
..         ...           ...    ...
591  Virginica   petal.width    2.3
592  Virginica   petal.width    1.9
593  Virginica   petal.width    2.0
594  Virginica   petal.width    2.3
595  Virginica   petal.width    1.8

[596 rows x 3 columns]


In [48]:
# PIVOT - CASTING back to wide

# Using pivot_table
pivot_df = melted_df.pivot_table(index='variety',columns='variable',values='value',aggfunc='mean')
print(pivot_df)

# Using pivot
# Step 1: Add a unique row index
# df = df.reset_index()  # 'index' will now be a column
# melted_df = pd.melt(df, id_vars=['index','variety'], value_vars=['sepal.length', 'sepal.width', 'petal.length', 'petal.width'])
# print(melted_df)
# pivot_df = melted_df.pivot(index='index', columns='variable', values='value')
# pivot_df['variety'] = df['variety']
# print(pivot_df)

variable    petal.length  petal.width  sepal.length  sepal.width
variety                                                         
Setosa          1.462000     0.246000      5.006000     3.428000
Versicolor      4.260000     1.326000      5.936000     2.770000
Virginica       5.561224     2.028571      6.604082     2.979592
