In [None]:
import pandas as pd



### Concat or Union Operation in Pandas
- In Pandas, you can combine or concatenate DataFrames using union-like operations, which are performed using the concat() function or the append() method. 
- These operations are similar to SQL UNION and allow you to stack DataFrames either vertically (row-wise) or horizontally (column-wise).

In [None]:
# creating df1
emp= {"name":["Rohish","Rahul","Priya"],"gender":["Male","Male","Female"], "email":["rohish@gmail.com","rahul@gmail.com","priya@gmail.com"] }
df1 = pd.DataFrame(emp)
df1

Unnamed: 0,name,gender,email
0,Rohish,Male,rohish@gmail.com
1,Rahul,Male,rahul@gmail.com
2,Priya,Female,priya@gmail.com


In [None]:
# creating df2
emp= {"name":["Pankaj","Sumit"],"gender":["Male","Male"], "email":["Pankaj@gmail.com","Sumit@gmail.com"] }
df2 = pd.DataFrame(emp)
df2

Unnamed: 0,name,gender,email
0,Pankaj,Male,Pankaj@gmail.com
1,Sumit,Male,Sumit@gmail.com


#### Vertical Concatenation (Row-wise)
This operation is similar to a SQL UNION ALL where you stack DataFrames on top of each other.

In [None]:
# union / appending 2 dataframes
pd.concat([df1, df2])
pd.concat([df1, df2], axis=0) # by default axix = 0 

Unnamed: 0,name,gender,email
0,Rohish,Male,rohish@gmail.com
1,Rahul,Male,rahul@gmail.com
2,Priya,Female,priya@gmail.com
0,Pankaj,Male,Pankaj@gmail.com
1,Sumit,Male,Sumit@gmail.com


In [None]:
# By default, the index is preserved. To ignore the index and reset it, you can use: ignore_index=True
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,name,gender,email
0,Rohish,Male,rohish@gmail.com
1,Rahul,Male,rahul@gmail.com
2,Priya,Female,priya@gmail.com
3,Pankaj,Male,Pankaj@gmail.com
4,Sumit,Male,Sumit@gmail.com


##### Using append() for Vertical Concatenation
You can also use the append() method for vertical concatenation.

In [None]:
df1.append(df2, ignore_index=True)

  df1.append(df2, ignore_index=True)


Unnamed: 0,name,gender,email
0,Rohish,Male,rohish@gmail.com
1,Rahul,Male,rahul@gmail.com
2,Priya,Female,priya@gmail.com
3,Pankaj,Male,Pankaj@gmail.com
4,Sumit,Male,Sumit@gmail.com


#### Horizontal Concatenation (Column-wise)
This operation aligns DataFrames side by side (like a SQL JOIN without a key).

In [None]:
pd.concat([df1, df2], axis=1)

Unnamed: 0,name,gender,email,name.1,gender.1,email.1
0,Rohish,Male,rohish@gmail.com,Pankaj,Male,Pankaj@gmail.com
1,Rahul,Male,rahul@gmail.com,Sumit,Male,Sumit@gmail.com
2,Priya,Female,priya@gmail.com,,,


In [None]:
pd.concat([df1, df2], axis=1, ignore_index=True)

Unnamed: 0,0,1,2,3,4,5
0,Rohish,Male,rohish@gmail.com,Pankaj,Male,Pankaj@gmail.com
1,Rahul,Male,rahul@gmail.com,Sumit,Male,Sumit@gmail.com
2,Priya,Female,priya@gmail.com,,,


### Dropping Duplicate Values
Pandas provides the drop_duplicates() function to accomplish this.


**Basic Syntax**:

**`df.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)`**

- **subset**: Specifies which columns to consider for identifying duplicates. ***If None, all columns are used.***
- **keep**: Determines which duplicates to keep. Options are:
  - `first`: Keep the first occurrence of a duplicate row (default).
  - `last`: Keep the last occurrence of a duplicate row.
  - `False`: Drop all duplicate rows.
- **inplace**: If True, performs operation in place and modifies the DataFrame directly. If False, returns a new DataFrame.
- **ignore_index**: If True, the resulting DataFrame will reset the index.

In [None]:
# Sample DataFrame
emp= {"name":["Rohish","Rahul","Priya","Vicky","Rahul"],"gender":["Male","Male","Female","Male","Male"], "email":["rohish@gmail.com","rahul@gmail.com","priya@gmail.com","rohish@gmail.com","rahul@gmail.com"] }
df = pd.DataFrame(emp)
df

Unnamed: 0,name,gender,email
0,Rohish,Male,rohish@gmail.com
1,Rahul,Male,rahul@gmail.com
2,Priya,Female,priya@gmail.com
3,Vicky,Male,rohish@gmail.com
4,Rahul,Male,rahul@gmail.com


##### Dropping Duplicates Across All Columns

In [None]:
df.drop_duplicates()

Unnamed: 0,name,gender,email
0,Rohish,Male,rohish@gmail.com
1,Rahul,Male,rahul@gmail.com
2,Priya,Female,priya@gmail.com
3,Vicky,Male,rohish@gmail.com


#### Dropping Duplicates Based on Specific Columns
If you want to drop duplicates based on specific columns, use the subset parameter:

In [None]:
df.drop_duplicates(subset='email')

Unnamed: 0,name,gender,email
0,Rohish,Male,rohish@gmail.com
1,Rahul,Male,rahul@gmail.com
2,Priya,Female,priya@gmail.com


In [None]:
# Keeping the Last Occurrence: by default it will be first
# df.drop_duplicates(subset='email', keep='first')
df.drop_duplicates(subset='email', keep='last')

Unnamed: 0,name,gender,email
2,Priya,Female,priya@gmail.com
3,Vicky,Male,rohish@gmail.com
4,Rahul,Male,rahul@gmail.com


In [None]:
df.drop_duplicates(subset = ['email','gender'] , keep = 'first')

Unnamed: 0,name,gender,email
0,Rohish,Male,rohish@gmail.com
1,Rahul,Male,rahul@gmail.com
2,Priya,Female,priya@gmail.com


In [None]:
# if keep=False the it will grop all duplicate rows
df.drop_duplicates(subset = 'email' , keep = False)

Unnamed: 0,name,gender,email
2,Priya,Female,priya@gmail.com


In [None]:
# fnding duplicates in df
dfg= df.groupby('email')['name'].count()
dfg[dfg.values>1]

Out[32]: email
rahul@gmail.com     2
rohish@gmail.com    2
Name: name, dtype: int64

### Handling NaN (Not a Number) values
Pandas provides several methods to manage NaN values, depending on your specific needs. 

In [None]:
# Sample Dataframe
import numpy as np
emp= {"name":["Rohish","Rahul","Priya",'Smit',np.nan],"gender":[np.nan,"Male","Female",np.nan,np.nan ], "email":["rohish@gmail.com","rahul@gmail.com","priya@gmail.com",np.nan,np.nan] }
df = pd.DataFrame(emp)
df

Unnamed: 0,name,gender,email
0,Rohish,,rohish@gmail.com
1,Rahul,Male,rahul@gmail.com
2,Priya,Female,priya@gmail.com
3,Smit,,
4,,,


#### Detecting NaN Values
- You can detect NaN values in a DataFrame using the isnull() or isna() functions. 
- These functions return a DataFrame of the same shape as the original, with True where NaN values are present.

In [None]:
df.isnull()
df.isna()

Unnamed: 0,name,gender,email
0,False,True,False
1,False,False,False
2,False,False,False
3,False,True,True
4,True,True,True


In [None]:
# returns the rows where gender is nan
df[df['gender'].isna()]

Unnamed: 0,name,gender,email
0,Rohish,,rohish@gmail.com
3,Smit,,
4,,,


#### Dropping NaN Values
You can remove rows or columns containing NaN values using the dropna() function.

##### Dropping Rows with NaN Values

In [None]:
# Drop rows with any NaN values
df.dropna() # By default, dropna() removes rows containing any NaN values.
df.dropna(how='any') 

Unnamed: 0,name,gender,email
1,Rahul,Male,rahul@gmail.com
2,Priya,Female,priya@gmail.com


In [None]:
# dropna() with how=all removes if all rows contains NaN values.
df.dropna(how='all')

Unnamed: 0,name,gender,email
0,Rohish,,rohish@gmail.com
1,Rahul,Male,rahul@gmail.com
2,Priya,Female,priya@gmail.com
3,Smit,,


In [None]:
# Dropping Rows with NaN Values in Specific Columns
df.dropna(subset=['name'])

Unnamed: 0,name,gender,email
0,Rohish,,rohish@gmail.com
1,Rahul,Male,rahul@gmail.com
2,Priya,Female,priya@gmail.com
3,Smit,,


##### Dropping Columns with NaN Values

In [None]:
# Drop columns with any NaN values
df.dropna(axis=1)
df.dropna(axis='columns')

0
1
2
3
4


In [None]:
df.dropna(how='all' , axis='columns')

Unnamed: 0,name,gender,email
0,Rohish,,rohish@gmail.com
1,Rahul,Male,rahul@gmail.com
2,Priya,Female,priya@gmail.com
3,Smit,,
4,,,


In [None]:
df.dropna(how='any',axis='columns')

0
1
2
3
4


In [None]:
df.dropna(subset=['name','email'] ,how='any',axis='index')

Unnamed: 0,name,gender,email
0,Rohish,,rohish@gmail.com
1,Rahul,Male,rahul@gmail.com
2,Priya,Female,priya@gmail.com


#### Filling NaN Values
You can fill NaN values using the fillna() function.

In [None]:
# Filling NaN Values with a Constant
df.fillna(0)
df.fillna('values_missing')

Unnamed: 0,name,gender,email
0,Rohish,values_missing,rohish@gmail.com
1,Rahul,Male,rahul@gmail.com
2,Priya,Female,priya@gmail.com
3,Smit,values_missing,values_missing
4,values_missing,values_missing,values_missing


In [None]:
# You can fill NaN values with the mean, median, or mode of the column.
df.fillna(df.mean())
df.fillna(df.median())
df.fillna(df.mode().iloc[0])

  df.fillna(df.mean())
  df.fillna(df.median())


Unnamed: 0,name,gender,email
0,Rohish,Female,rohish@gmail.com
1,Rahul,Male,rahul@gmail.com
2,Priya,Female,priya@gmail.com
3,Smit,Female,priya@gmail.com
4,Priya,Female,priya@gmail.com


In [None]:
# Forward fill NaN values
df.ffill()

Unnamed: 0,name,gender,email
0,Rohish,,rohish@gmail.com
1,Rahul,Male,rahul@gmail.com
2,Priya,Female,priya@gmail.com
3,Smit,Female,priya@gmail.com
4,Smit,Female,priya@gmail.com


In [None]:
# Backward fill NaN values
df.bfill()

Unnamed: 0,name,gender,email
0,Rohish,Male,rohish@gmail.com
1,Rahul,Male,rahul@gmail.com
2,Priya,Female,priya@gmail.com
3,Smit,,
4,,,


In [None]:
# You can replace NaN values with a specific value using the replace() function.
df.replace(to_replace=np.nan, value=0)

Unnamed: 0,name,gender,email
0,Rohish,0,rohish@gmail.com
1,Rahul,Male,rahul@gmail.com
2,Priya,Female,priya@gmail.com
3,Smit,0,0
4,0,0,0


In [None]:
# replacing specific column values
df['gender'].fillna(value='new_male', inplace=True)
df

Unnamed: 0,name,gender,email
0,Rohish,new_male,rohish@gmail.com
1,Rahul,Male,rahul@gmail.com
2,Priya,Female,priya@gmail.com
3,Smit,new_male,
4,,new_male,


### Readind from a clipboard

In [None]:
#  this code is not working in databicks attached the jupyter notebook code snippet
# clipboard_df = pd.read_clipboard(sep=',')
# clipboard_df

[0;31m---------------------------------------------------------------------------[0m
[0;31mPyperclipException[0m                        Traceback (most recent call last)
File [0;32m<command-3970698818403425>:1[0m
[0;32m----> 1[0m clipboard_df [38;5;241m=[39m pd[38;5;241m.[39mread_clipboard(sep[38;5;241m=[39m[38;5;124m'[39m[38;5;124m,[39m[38;5;124m'[39m)
[1;32m      2[0m clipboard_df

File [0;32m/databricks/python/lib/python3.9/site-packages/pandas/io/clipboards.py:43[0m, in [0;36mread_clipboard[0;34m(sep, **kwargs)[0m
[1;32m     40[0m [38;5;28;01mfrom[39;00m [38;5;21;01mpandas[39;00m[38;5;21;01m.[39;00m[38;5;21;01mio[39;00m[38;5;21;01m.[39;00m[38;5;21;01mclipboard[39;00m [38;5;28;01mimport[39;00m clipboard_get
[1;32m     41[0m [38;5;28;01mfrom[39;00m [38;5;21;01mpandas[39;00m[38;5;21;01m.[39;00m[38;5;21;01mio[39;00m[38;5;21;01m.[39;00m[38;5;21;01mparsers[39;00m [38;5;28;01mimport[39;00m read_csv
[0;32m---> 43[0m text [38;5;2

**Jupyter Notebook code snippet**

<div style="text-align: center; line-height: 0; padding-top: 9px;">
  <img src="https://raw.githubusercontent.com/rohish-zade/Python/master/python_data_manipulation_and_analysis/Pandas/read_clipboard.png" alt="read_clipboard" style="width: 600px">
</div>