## How to check Python Environment in Jupyter

In [46]:
import sys

# sys.executable is the path of the Python interpreter running this kernel,
# so printing it shows which environment the notebook is using.
print(sys.executable)

/Users/hk/anaconda3/bin/python


## How to check all the installed libraries in the conda environment

In [1]:
! conda list

# packages in environment at /Users/hk/anaconda3:
#
# Name                    Version                   Build  Channel
_py-xgboost-mutex         2.0                       cpu_0  
anaconda-client           1.11.2          py310hca03da5_0  
anaconda-navigator        2.4.0           py310hca03da5_0  
anaconda-project          0.11.1          py310hca03da5_0  
anyio                     3.7.0              pyhd8ed1ab_1    conda-forge
appdirs                   1.4.4              pyhd3eb1b0_0  
appnope                   0.1.3              pyhd8ed1ab_0    conda-forge
argon2-cffi               21.3.0             pyhd8ed1ab_0    conda-forge
argon2-cffi-bindings      21.2.0          py310h1a28f6b_0  
asttokens                 2.2.1              pyhd8ed1ab_0    conda-forge
async-lru                 2.0.2              pyhd8ed1ab_0    conda-forge
attrs                     22.1.0          py310hca03da5_0  
babel                     2.12.1             pyhd8ed1ab_1    conda-forge
backcall               

## How to install a package using conda

In [None]:
"""!conda install package_name -y"""

# The -y flag confirms the installation automatically, without prompting the user

# This is useful for automation and scripting

## How to launch jupyter lab from the terminal


1. Go to the directory where you want to launch
2. Activate your conda environment using the following command
    conda activate <env_name>
3. jupyter lab


## Working with CSV file

In [1]:
import pandas as pd

In [7]:
land_temps_df = pd.read_csv("./raw_data/landtempssample.csv")

In [8]:
land_temps_df.columns

Index(['locationid', 'year', 'month', 'temp', 'latitude', 'longitude',
       'stnelev', 'station', 'countryid', 'country'],
      dtype='object')

In [10]:
# Bind a shorter name to the same DataFrame object. This is an alias, not a
# copy: `df` and `land_temps_df` refer to one object until `df` is reassigned.
df = land_temps_df


## Renaming a couple of columns

In [15]:
# Map each original column label to its more descriptive replacement.
column_renames = {
    "locationid": "station_id",
    "temp": "avg_temp",
    "stnelev": "elevation",
}

# rename() returns a new DataFrame, so the result is bound back to `df`.
df = df.rename(columns=column_renames)
                     

### Why did we reassign the dataframe?

In [None]:
"""

Because, by default, pandas methods such as rename() do not modify the original
dataframe. They return a new, modified dataframe ("inplace = False" is the
default), so the result has to be assigned to a variable to keep it.

To modify the existing dataframe instead, we can pass the argument
"inplace = True". Note that this usually does not save memory or improve
performance, because pandas often still builds a copy internally, so
reassigning the result is generally the preferred style.

So in the above example, if we want to make the changes on the
same dataframe, we can use the inplace = True argument as follows:


df.rename(columns = {"locationid" : "station_id",
                     "temp" : "avg_temp",
                     "stnelev" : "elevation"},inplace= True)



Note:
	•	inplace=True returns None and overwrites the original DataFrame, so
 changes cannot be undone easily.
	•	inplace=False (default) ensures that the original DataFrame 
 remains unchanged unless explicitly reassigned.

"""

### How to check the memory usage?

In [29]:
# Check memory Usage of column

df.memory_usage(deep = True)

Index             128
station_id    6800000
year           800000
month          800000
avg_temp       800000
latitude       800000
longitude      800000
elevation      800000
station       6870902
countryid     5900000
country       6701794
date           800000
dtype: int64

In [None]:
"""	
•	This returns the memory usage (in bytes) for each column.
•	The deep=True argument ensures it accounts for object (string) 
data types accurately.
 """

In [33]:
# Check total memory usage of the dataframe

df.memory_usage(deep= True).sum()

31872824

In [None]:
"""	This returns the total memory usage of the entire DataFrame in bytes."""

In [34]:
# Check memory usage with .info()

df.info(memory_usage = "deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   station_id  100000 non-null  object        
 1   year        100000 non-null  int64         
 2   month       100000 non-null  int64         
 3   avg_temp    85554 non-null   float64       
 4   latitude    100000 non-null  float64       
 5   longitude   100000 non-null  float64       
 6   elevation   100000 non-null  float64       
 7   station     100000 non-null  object        
 8   countryid   100000 non-null  object        
 9   country     99995 non-null   object        
 10  date        100000 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(4), int64(2), object(4)
memory usage: 30.4 MB


In [36]:
# Express the total footprint in megabytes (1 MB taken as 1024**2 bytes)
# so the number is easier to read than a raw byte count.
bytes_per_mb = 1024**2
total_bytes = df.memory_usage(deep=True).sum()
memory_in_mb = total_bytes / bytes_per_mb
print(f"Memory Usage:{memory_in_mb:.2f} MB")

Memory Usage:30.40 MB


## Checking the attributes of the dataframe

### Check the column names

In [27]:
df.columns

Index(['station_id', 'year', 'month', 'avg_temp', 'latitude', 'longitude',
       'elevation', 'station', 'countryid', 'country', 'date'],
      dtype='object')

### Check the data_types

In [20]:
df.dtypes

station_id            object
year                   int64
month                  int64
avg_temp             float64
latitude             float64
longitude            float64
elevation            float64
station               object
countryid             object
country               object
date          datetime64[ns]
dtype: object

### Check the shape of the dataframe

In [37]:
df.shape

(100000, 11)

## Creating a new column combining "month" and "year" columns

In [17]:
# Build "<year>-<month>-01" strings from the year and month columns, then
# parse them so each row gets the first day of its month as a datetime.
year_text = df["year"].astype(str)
month_text = df["month"].astype(str)
df["date"] = pd.to_datetime(year_text + "-" + month_text + "-01")

In [18]:
df.columns

Index(['station_id', 'year', 'month', 'avg_temp', 'latitude', 'longitude',
       'elevation', 'station', 'countryid', 'country', 'date'],
      dtype='object')

In [19]:
df.head()

Unnamed: 0,station_id,year,month,avg_temp,latitude,longitude,elevation,station,countryid,country,date
0,USS0010K01S,2000,4,5.27,39.9,-110.75,2773.7,INDIAN_CANYON,US,United States,2000-04-01
1,CI000085406,1940,5,18.04,-18.35,-70.333,58.0,ARICA,CI,Chile,1940-05-01
2,USC00036376,2013,12,6.22,34.3703,-91.1242,61.0,SAINT_CHARLES,US,United States,2013-12-01
3,ASN00024002,1963,2,22.93,-34.2833,140.6,65.5,BERRI_IRRIGATION,AS,Australia,1963-02-01
4,ASN00028007,2001,11,,-14.7803,143.5036,79.4,MUSGRAVE,AS,Australia,2001-11-01


### Why do we use parentheses() sometimes and we do not use them other times? 

Whether you use parentheses () in pandas depends on whether you're
accessing an attribute or calling a method.

**1️⃣ Attributes (No Parentheses)**

- Attributes store information about the DataFrame and do not require parentheses.

```python
df.columns  # Returns the column names
df.dtypes   # Returns the data types of each column
df.shape    # Returns the number of rows and columns as a tuple (rows, columns)
df.index    # Returns the index labels of the DataFrame
```

Why no parentheses?

- These are properties of the DataFrame that simply hold values, so we access them directly.

**2️⃣ Methods (With Parentheses)**

- Methods perform an action on the DataFrame, so they require parentheses.
- Examples:

```python
df.head()      # Returns the first 5 rows of the DataFrame
df.info()      # Prints a summary of the DataFrame
df.describe()  # Generates summary statistics
df.rename()    # Renames columns or index
```

Why use parentheses?

- Methods are functions that do something to the DataFrame, and parentheses allow you to execute them.

**Summary**

- If you're getting some information about the DataFrame → No parentheses (e.g., `df.columns`).
- If you're doing something (modifying, summarizing, etc.) → Use parentheses (e.g., `df.head()`).

## Getting the info of the dataframe

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   station_id  100000 non-null  object        
 1   year        100000 non-null  int64         
 2   month       100000 non-null  int64         
 3   avg_temp    85554 non-null   float64       
 4   latitude    100000 non-null  float64       
 5   longitude   100000 non-null  float64       
 6   elevation   100000 non-null  float64       
 7   station     100000 non-null  object        
 8   countryid   100000 non-null  object        
 9   country     99995 non-null   object        
 10  date        100000 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(4), int64(2), object(4)
memory usage: 8.4+ MB


### Getting statistical Data of a column

In [39]:
df.avg_temp.describe()

count    85554.000000
mean        10.920770
std         11.522444
min        -70.700000
25%          3.460000
50%         12.220000
75%         19.570000
max         39.950000
Name: avg_temp, dtype: float64

### Checking the missing values 

In [40]:
df.isnull().sum()

station_id        0
year              0
month             0
avg_temp      14446
latitude          0
longitude         0
elevation         0
station           0
countryid         0
country           5
date              0
dtype: int64

### Dropping the rows where there are missing values in at least one of the columns

In [44]:
df.shape

(85552, 11)

In [45]:
# Keep only the rows that have a value in both `avg_temp` and `country`.
# how="any" (the default) drops a row when at least one of the subset columns
# is missing. The result is reassigned instead of using inplace=True, so the
# cell reads as an explicit transformation of `df`.
df = df.dropna(subset=["avg_temp", "country"], how="any")

In [43]:
df.shape

(85552, 11)

In [None]:
### Drop rows when missing values in both columns - use how = "all" argument

# how="all" drops a row only when every column listed in `subset` is missing.
# The result is reassigned instead of using inplace=True, so the cell reads
# as an explicit transformation of `df`.
df = df.dropna(subset=["avg_temp", "country"], how="all")