In [80]:
import pandas as pd
import warnings

In [81]:
warnings.simplefilter(action='ignore', category=FutureWarning)

### Creating a DataFrame from dictionary

In [82]:
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [24, 27, 22, 32, 29],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
    'Salary': [70000, 80000, 65000, 120000, 95000]
}

In [83]:
type(data)

dict

In [84]:
df = pd.DataFrame(data)
print(df)

      Name  Age         City  Salary
0    Alice   24     New York   70000
1      Bob   27  Los Angeles   80000
2  Charlie   22      Chicago   65000
3    David   32      Houston  120000
4      Eve   29      Phoenix   95000


### display

In [85]:
# first 5 values
print("Head of the DataFrame:\n", df.head())

Head of the DataFrame:
       Name  Age         City  Salary
0    Alice   24     New York   70000
1      Bob   27  Los Angeles   80000
2  Charlie   22      Chicago   65000
3    David   32      Houston  120000
4      Eve   29      Phoenix   95000


In [86]:
# first 2 values
print("First 2 values:\n", df.head(2))

First 2 values:
     Name  Age         City  Salary
0  Alice   24     New York   70000
1    Bob   27  Los Angeles   80000


In [87]:
# last 5 values
print("\nTail of the DataFrame:\n", df.tail(2))


Tail of the DataFrame:
     Name  Age     City  Salary
3  David   32  Houston  120000
4    Eve   29  Phoenix   95000


In [88]:
# shape of the dataframe
print("\nShape of the DataFrame:", df.shape)


Shape of the DataFrame: (5, 4)


Format: [rows, columns]

In [89]:
# summary statistics
print("\nSummary statistics:\n", df.describe())


Summary statistics:
              Age         Salary
count   5.000000       5.000000
mean   26.800000   86000.000000
std     3.962323   22192.341021
min    22.000000   65000.000000
25%    24.000000   70000.000000
50%    27.000000   80000.000000
75%    29.000000   95000.000000
max    32.000000  120000.000000


### Column operations

In [90]:
# column names
print("\nColumn names:", df.columns)


Column names: Index(['Name', 'Age', 'City', 'Salary'], dtype='object')


In [91]:
# a single column
print("\nSelect a single column (Age):\n", df['Age'])


Select a single column (Age):
 0    24
1    27
2    22
3    32
4    29
Name: Age, dtype: int64


In [92]:
df[["Name", "Age"]]

Unnamed: 0,Name,Age
0,Alice,24
1,Bob,27
2,Charlie,22
3,David,32
4,Eve,29


In [93]:
# multiple columns
print("\nSelect multiple columns (Name and Salary):\n", df[['Name', 'Salary']])


Select multiple columns (Name and Salary):
       Name  Salary
0    Alice   70000
1      Bob   80000
2  Charlie   65000
3    David  120000
4      Eve   95000


In [94]:
df[df['Salary']>70000] #Comparing all the "Salary" rows, and only selecting the ones which are having salary greater than 70000

Unnamed: 0,Name,Age,City,Salary
1,Bob,27,Los Angeles,80000
3,David,32,Houston,120000
4,Eve,29,Phoenix,95000


In [95]:

df["Salary"]

0     70000
1     80000
2     65000
3    120000
4     95000
Name: Salary, dtype: int64

In [96]:
# Filter rows based on a condition
print("\nRows where Age > 25:\n", df[df['Age'] > 25])


Rows where Age > 25:
     Name  Age         City  Salary
1    Bob   27  Los Angeles   80000
3  David   32      Houston  120000
4    Eve   29      Phoenix   95000


In [97]:
# Add a new column
df['Experience'] = [2, 5, 1, 10,8]
print("\nDataFrame after adding Experience column:\n", df)


DataFrame after adding Experience column:
       Name  Age         City  Salary  Experience
0    Alice   24     New York   70000           2
1      Bob   27  Los Angeles   80000           5
2  Charlie   22      Chicago   65000           1
3    David   32      Houston  120000          10
4      Eve   29      Phoenix   95000           8


In [98]:
print([1, 2])

[1, 2]


In [99]:
# Remove a column
df=df.drop('City', axis=1)
print("\nDataFrame after removing City column:\n", df)


DataFrame after removing City column:
       Name  Age  Salary  Experience
0    Alice   24   70000           2
1      Bob   27   80000           5
2  Charlie   22   65000           1
3    David   32  120000          10
4      Eve   29   95000           8


Similarly of we had to drop a whole row, then we should be setting up the axis as 0 and define the index name, here the index that we have is the serial Nos (i.e 0,1,2,3,4), the serial numbers become the identifier.

**Note**: Here you woule be setting the axis as 1 because of the *Format* that we discussed before ([Rows, Columns]), and our requirement was to remove a row 

In [100]:
df.drop(0, axis=0) # Here i want to remove the first row. Hence I will mention the identifier value (in our case is index) and axis value

Unnamed: 0,Name,Age,Salary,Experience
1,Bob,27,80000,5
2,Charlie,22,65000,1
3,David,32,120000,10
4,Eve,29,95000,8


In [101]:
# Rename columns
df.rename(columns={'Name': 'Employee Name'}, inplace=True)
print("\nDataFrame after renaming columns:\n", df)


DataFrame after renaming columns:
   Employee Name  Age  Salary  Experience
0         Alice   24   70000           2
1           Bob   27   80000           5
2       Charlie   22   65000           1
3         David   32  120000          10
4           Eve   29   95000           8


The `inplace=True` has to be set because it would make an **permanent change in your dataframe** 

### Handling missing values

In [102]:
# Adding a NaN value for demonstration
df.at[2, 'Age'] = None
print("\nDataFrame with NaN value:\n", df)


DataFrame with NaN value:
   Employee Name   Age  Salary  Experience
0         Alice  24.0   70000           2
1           Bob  27.0   80000           5
2       Charlie   NaN   65000           1
3         David  32.0  120000          10
4           Eve  29.0   95000           8


Now you have to know where the missing values is, this is a simple data if you work for a industry the size of the data incereases alot and finding manually from them is impractical herce we use the following syntax to,
* To find the null values as in the form of table
* To find the total number of null values in each columns

In [103]:
#To find the null values as in the form of table
print (df.isna())

   Employee Name    Age  Salary  Experience
0          False  False   False       False
1          False  False   False       False
2          False   True   False       False
3          False  False   False       False
4          False  False   False       False


Here the whole table was iterated and wherever the null values were present an `True` boolean value was setup.

In [104]:
#To find total number of null values
print (df.isna().sum())

Employee Name    0
Age              1
Salary           0
Experience       0
dtype: int64


So from the above data we got to know we had a null value in the age column hence missing data input should be done in those columns. Hence the above code helps us identify where the null values are

In [105]:
df["Age"].mean()

28.0

In [106]:
#Filling the age column with mean
test_df=df[:] #--> This syntax df[:] copys the dataframe into the variable test_df
test_df["Age"].fillna(test_df["Age"].mean(), inplace=True)
test_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["Age"].fillna(test_df["Age"].mean(), inplace=True)


Unnamed: 0,Employee Name,Age,Salary,Experience
0,Alice,24.0,70000,2
1,Bob,27.0,80000,5
2,Charlie,28.0,65000,1
3,David,32.0,120000,10
4,Eve,29.0,95000,8


In the above cell we filled the `NAN` values with the mean, similarlyif we had to fill the `NAN` with median or mode.

In [107]:
test2_df=df[:]
test3_df=df[:]
print ("Dataframe where the Age column has been filled with the median")
test2_df["Age"].fillna(test2_df["Age"].median(), inplace=True)
print (test2_df)
print ("Dataframe where the Age column has been filled with the mean")
test3_df["Age"].fillna(test3_df["Age"].mean(), inplace=True)
print(test3_df)

Dataframe where the Age column has been filled with the median
  Employee Name   Age  Salary  Experience
0         Alice  24.0   70000           2
1           Bob  27.0   80000           5
2       Charlie  28.0   65000           1
3         David  32.0  120000          10
4           Eve  29.0   95000           8
Dataframe where the Age column has been filled with the mean
  Employee Name   Age  Salary  Experience
0         Alice  24.0   70000           2
1           Bob  27.0   80000           5
2       Charlie  28.0   65000           1
3         David  32.0  120000          10
4           Eve  29.0   95000           8


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test2_df["Age"].fillna(test2_df["Age"].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test3_df["Age"].fillna(test3_df["Age"].mean(), inplace=True)


Now what if we had a null value in the Object Type, we use the `bfill` and `ffill`

In [108]:
df.at[2, "Employee Name"]=None

In [109]:
df.head()

Unnamed: 0,Employee Name,Age,Salary,Experience
0,Alice,24.0,70000,2
1,Bob,27.0,80000,5
2,,28.0,65000,1
3,David,32.0,120000,10
4,Eve,29.0,95000,8


In [114]:
df=df.ffill()

Did a forward fill in the NAN values, here you should not mention the columns as it identifies on it own. 

**Note**: If you want to do handling of missing data to the int64 or float64 you need to do before handed itself, after doing those you could perform handling of missing data on the object type data. If done in revere form then,
* The null values in int columns would be filled with wrong/mathematically wrong values
* Improper predictions by ML Model

In [115]:
df.head()

Unnamed: 0,Employee Name,Age,Salary,Experience
0,Alice,24.0,70000,2
1,Bob,27.0,80000,5
2,Bob,28.0,65000,1
3,David,32.0,120000,10
4,Eve,29.0,95000,8


### Grouping and sorting data

In [129]:
df.loc[len(df)]=["Rohit",20,100000000,2]
df

Unnamed: 0,Employee Name,Age,Salary,Experience
0,Alice,24.0,70000,2
1,Bob,27.0,80000,5
2,Bob,28.0,65000,1
3,David,32.0,120000,10
4,Eve,29.0,95000,8
5,Rohit,20.0,70000,2
6,Rohit,20.0,100000000,2


In [136]:
#cell 1
print(df.groupby("Experience").sum()) # This basically clubbed all the values, notice how the columsn with experience clubbed all the common experience row values eg "AliceRohitRohit" with same experience

              Employee Name   Age     Salary
Experience                                  
1                       Bob  28.0      65000
2           AliceRohitRohit  64.0  100140000
5                       Bob  27.0      80000
8                       Eve  29.0      95000
10                    David  32.0     120000


In [130]:
# Grouping data
grouped = df.groupby('Experience')["Salary"].mean()
print("\nGrouped DataFrame by Experience:\n", grouped)


Grouped DataFrame by Experience:
 Experience
1        65000.0
2     33380000.0
5        80000.0
8        95000.0
10      120000.0
Name: Salary, dtype: float64


In the above cell we built a step code, which does teh following:
* df.groupby(): This would group all the values which are in common into a single row, for example: we had the Experience set as parameter, now this group value would club all the experiences which has 1, 2, 5, 8, 10 and return a distinct dataframe with all the clubbed values. 


* df.groupby("Experience"): Join all the rows which have same values in experience columns. (goto cell 1 for viewing the expample data) 


* df.groupby("Experience")["Salary"]: For calculating the mean we need some value, and that value is the "Salary", Note the value inside the "[]" shiuld be a int or float datatype.


* df.groupby('Experience')["Salary"].mean(): Calculate the mean!

In [126]:
df["Experience"].astype(str)

0     2
1     5
2     1
3    10
4     8
5     2
Name: Experience, dtype: object

In [57]:
# Sorting data
sorted_df = df.sort_values(by='Salary', ascending=False)
print("\nDataFrame sorted by Salary (descending):\n", sorted_df)


DataFrame sorted by Salary (descending):
   Employee Name  Emp Age  Salary  Experience
3         David       32  120000          10
4           Eve       29   95000           8
1           Bob       27   80000           5
0         Alice       24   70000           2
5         Rohit       20   70000           2
2          None       22   65000           1


In [58]:
sorted_df.reset_index()

Unnamed: 0,index,Employee Name,Emp Age,Salary,Experience
0,3,David,32,120000,10
1,4,Eve,29,95000,8
2,1,Bob,27,80000,5
3,0,Alice,24,70000,2
4,5,Rohit,20,70000,2
5,2,,22,65000,1


In [59]:
sorted_df=sorted_df.iloc[:, 1:]

# sorted_df

### Saving data to csv and reading from it

In [61]:
# Saving DataFrame to a CSV file
df.to_csv('employee_data1.csv', index=False)
print("\nDataFrame saved to 'employee_data.csv'")

# Reading DataFrame from a CSV file
df_from_csv = pd.read_csv('employee_data.csv')
print("\nDataFrame read from 'employee_data.csv':\n", df_from_csv)


DataFrame saved to 'employee_data.csv'

DataFrame read from 'employee_data.csv':
   Employee Name  Emp Age  Salary  Experience
0         Alice       24   70000           2
1           Bob       27   80000           5
2           NaN       22   65000           1
3         David       32  120000          10
4           Eve       29   95000           8
5         Rohit       20   70000           2
