In [14]:
# 1. Use Views Instead of Copies
# 🔹 Description:
# NumPy lets you create views (shallow copies) of an array instead of deep copies.

# 🔹 Code:
# python
# Copy
# Edit
# import numpy as np

# a = np.array([1, 2, 3, 4, 5])
# b = a[1:4]   # This is a view
# b[0] = 99

# print("Original array:", a)  # a is also changed
# 🔹 Why it's memory-efficient:
# No new data is created — just a view on the existing array.

# ✅ 2. Use astype() with copy=False
# 🔹 Description:
# When converting datatypes, avoid unnecessary copying.

# 🔹 Code:
# python
# Copy
# Edit
# arr = np.arange(10, dtype=np.float64)
# arr_int = arr.astype(np.int32, copy=False)  # Tries to avoid a copy
# ✅ 3. Use np.memmap for Large Files
# 🔹 Description:
# Memory-mapped arrays are used to read large binary files without fully loading them into memory.

# 🔹 Code:
# python
# Copy
# Edit
# data = np.memmap('large_array.dat', dtype='float32', mode='r', shape=(1000000,))
# print(data[100])  # Access data without loading all
# ✅ 4. Use Generators (if converting from lists)
# 🔹 Description:
# Instead of loading a huge list, use generators to yield data lazily.

# 🔹 Code:
# python
# Copy
# Edit
# def gen():
#     for i in range(1000000):
#         yield i

# arr = np.fromiter(gen(), dtype=np.int32)
# ✅ 5. Use np.resize() or np.reshape() Carefully
# reshape() returns a view (when possible), hence memory efficient.

# resize() modifies the original array and may copy.

# 🔹 Example:
# python
# Copy
# Edit
# a = np.arange(12)
# b = a.reshape(3, 4)  # View – no copy
# ✅ 6. Use Proper Data Types (dtype)
# 🔹 Description:
# Smaller dtype uses less memory (e.g., int8 instead of int64 if possible).

# 🔹 Code:
# python
# Copy
# Edit
# a = np.array([1, 2, 3], dtype=np.int64)
# b = a.astype(np.int8)

# print(b.nbytes)  # Less memory
# ✅ 7. Avoid Unnecessary Copies in Operations
# 🔹 Description:
# Operations like slicing, transposing, and broadcasting don't create full copies.

# 🔹 Example:
# python
# Copy
# Edit
# a = np.arange(100).reshape(10, 10)
# b = a.T   # Transpose – returns a view, not a copy
# ✅ 8. Use In-place Operations (+=, *=, etc.)
# 🔹 Description:
# These modify data without allocating new memory.

# 🔹 Code:
# python
# Copy
# Edit
# a = np.array([1, 2, 3])
# a += 5  # In-place

In [15]:
#Pandas Series
import pandas as pd

a = [1, 7, 2]

myvar = pd.Series(a)

print(myvar)







0    1
1    7
2    2
dtype: int64


In [16]:
a = [1, 7, 2]

myvar = pd.Series(a, index = ["x", "y", "z"])

print(myvar)

x    1
y    7
z    2
dtype: int64


In [17]:
import pandas as pd

calories = {"day1": 420, "day2": 380, "day3": 390}

myvar = pd.Series(calories)

print(myvar)

day1    420
day2    380
day3    390
dtype: int64


In [18]:
import pandas as pd

calories = {"day1": 420, "day2": 380, "day3": 390}

myvar = pd.Series(calories, index = ["day1", "day2"])

print(myvar)

day1    420
day2    380
dtype: int64


In [19]:
d = {'a': 1, 'b': 2, 'c': 3}
ser = pd.Series(d)
ser

Unnamed: 0,0
a,1
b,2
c,3


In [20]:
r = [1, 2]
ser = pd.Series(r, copy=False)
ser.iloc[0] = 999
r
ser

Unnamed: 0,0
0,999
1,2


In [21]:
import pandas as pd
import numpy as np

data = np.array(['g','e','e','k','s','f', 'o','r','g','e','e','k','s'])
ser = pd.Series(data)

print(ser[:5])

0    g
1    e
2    e
3    k
4    s
dtype: object


In [22]:
import pandas as pd

df = pd.read_csv("/content/nba.csv")

ser = pd.Series(df['Name'])
data = ser.head(10)
data

Unnamed: 0,Name
0,Avery Bradley
1,Jae Crowder
2,John Holland
3,R.J. Hunter
4,Jonas Jerebko
5,Amir Johnson
6,Jordan Mickey
7,Kelly Olynyk
8,Terry Rozier
9,Marcus Smart


In [23]:
data.loc[3:6]

Unnamed: 0,Name
3,R.J. Hunter
4,Jonas Jerebko
5,Amir Johnson
6,Jordan Mickey


In [24]:
import pandas as pd

data = {'col1': [10, 20, 30, 40],
        'col2': [100, 200, 300, 400]}
df = pd.DataFrame(data, index=['A', 'B', 'C', 'D'])

# Using .loc
print("Using .loc:")
print(df.loc['B', 'col2'])  # Access a single cell by label
print(df.loc['A':'C', 'col1']) # Slice rows by label (inclusive)

# Using .iloc
print("\nUsing .iloc:")
print(df.iloc[1, 1])  # Access a single cell by integer position
print(df.iloc[0:3, 0]) # Slice rows by integer position (exclusive)

Using .loc:
200
A    10
B    20
C    30
Name: col1, dtype: int64

Using .iloc:
200
A    10
B    20
C    30
Name: col1, dtype: int64


In [25]:
import pandas as pd

ser1 = pd.Series([1,2,3],index =['A','B','C'])
ser2 = pd.Series([4,5,6],index = ['A','B','C'])

df_sum = ser1.cov(ser2)
print(df_sum)

1.0


In [26]:
import pandas as pd

ser = pd.Series([1,2,3,4])
ser = ser.astype(float)
print(ser)

0    1.0
1    2.0
2    3.0
3    4.0
dtype: float64


In [27]:
data_dict = {'Geeks':10,'for':20,'geeks':30}
ser = pd.Series(data_dict)
print(ser)

Geeks    10
for      20
geeks    30
dtype: int64


In [28]:
import pandas as pd

ser = pd.Series()

print(ser)

Series([], dtype: object)


In [29]:
import pandas as pd
import numpy as np

data = np.array(['g', 'e', 'e', 'k', 's'])

ser = pd.Series(data)
print(ser)

0    g
1    e
2    e
3    k
4    s
dtype: object


In [30]:
import pandas as pd

data_list = ['g', 'e', 'e', 'k', 's']

ser = pd.Series(data_list)
print(ser)

0    g
1    e
2    e
3    k
4    s
dtype: object


In [31]:
import pandas as pd

data_dict = {'Geeks': 10, 'for': 20, 'geeks': 30}

ser = pd.Series(data_dict)
print(ser)

Geeks    10
for      20
geeks    30
dtype: int64


In [32]:
import numpy as np
import pandas as pd


ser = pd.Series(np.linspace(1,10,5))

print(ser)

0     1.00
1     3.25
2     5.50
3     7.75
4    10.00
dtype: float64


In [33]:
#series using List Comprehension

ser = pd.Series(range(1,20,3),index = [x for x in 'abcdefg'])
print(ser)

a     1
b     4
c     7
d    10
e    13
f    16
g    19
dtype: int64


In [34]:
import pandas as pd

lst = ['Geeks','For','Geeks','is','portal','for','Geeks']
df = pd.DataFrame(lst)
print(df)



        0
0   Geeks
1     For
2   Geeks
3      is
4  portal
5     for
6   Geeks


In [35]:
data = {'Name':['Tom','Nick','krish','jack'],
        'Age':[20,21,19,18]}

df = pd.DataFrame(data)
print(df)
print(df[['Name']])


    Name  Age
0    Tom   20
1   Nick   21
2  krish   19
3   jack   18
    Name
0    Tom
1   Nick
2  krish
3   jack


In [36]:
df = pd.read_csv('nba.csv',index_col= 'Name')
row2 = df.iloc[3]
print(row2)

Team        Boston Celtics
Number                28.0
Position                SG
Age                   22.0
Height                 6-5
Weight               185.0
College      Georgia State
Salary           1148640.0
Name: R.J. Hunter, dtype: object


In [37]:
data = pd.read_csv('nba.csv')
data.head()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


In [38]:
dict = {'First Score':[100, 90, np.nan, 95],
        'Second Score': [30, 45, 56, np.nan],
        'Third Score':[np.nan, 40, 80, 98]}

df = pd.DataFrame(dict)

print([df.isnull().sum().index])

[Index(['First Score', 'Second Score', 'Third Score'], dtype='object')]


In [39]:
data.describe()

Unnamed: 0,Number,Age,Weight,Salary
count,457.0,457.0,457.0,446.0
mean,17.678337,26.938731,221.522976,4842684.0
std,15.96609,4.404016,26.368343,5229238.0
min,0.0,19.0,161.0,30888.0
25%,5.0,24.0,200.0,1044792.0
50%,13.0,26.0,220.0,2839073.0
75%,25.0,30.0,240.0,6500000.0
max,99.0,40.0,307.0,25000000.0


In [40]:
data.shape

(458, 9)

In [41]:
data.size

4122

In [78]:
import pandas as pd

# Create a sample DataFrame
df = pd.read_csv('nba.csv')
df.head()


df  = df.rename(columns = {'Name':'Permanent_name'})
df

Unnamed: 0,Permanent_name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [43]:
mv = df.isnull().sum()
print(mv)

Permanent_name     1
Team               1
Number             1
Position           1
Age                1
Height             1
Weight             1
College           85
Salary            12
dtype: int64


In [44]:
print(df.notnull().sum())

Permanent_name    457
Team              457
Number            457
Position          457
Age               457
Height            457
Weight            457
College           373
Salary            446
dtype: int64


In [45]:
df = df.fillna(0)
df

Unnamed: 0,Permanent_name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,0,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,0,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,0,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [46]:
data.replace(to_replace=np.nan, value=99)
data

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [47]:
import pandas as pd
import numpy as np

dict = {'First Score': [100, 90, np.nan, 95],
        'Second Score': [30, np.nan, 45, 56],
        'Third Score': [52, 40, 80, 98],
        'Fourth Score': [np.nan, np.nan, np.nan, 65]}
df = pd.DataFrame(dict)

df = df.dropna()
df

Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
3,95.0,56.0,98,65.0


In [48]:
dict = {'First Score': [100, np.nan, np.nan, 95],
        'Second Score': [30, np.nan, 45, 56],
        'Third Score': [52, np.nan, 80, 98],
        'Fourth Score': [np.nan, np.nan, np.nan, 65]}
df = pd.DataFrame(dict)

sa= df.dropna(axis = 0)
sa

Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
3,95.0,56.0,98.0,65.0


In [49]:
# importing pandas as library
import pandas as pd

# creating data frame:
df = pd.DataFrame({'name': ['Akash', 'Ayush', 'Ashish',
                            'Diksha', 'Shivani'],

                   'Age': [21, 25, 23, 22, 18],

                   'MotherTongue': ['Hindi', 'English', 'Marathi',
                                    'Bhojpuri', 'Oriya']})

print("The original data frame")
df

The original data frame


Unnamed: 0,name,Age,MotherTongue
0,Akash,21,Hindi
1,Ayush,25,English
2,Ashish,23,Marathi
3,Diksha,22,Bhojpuri
4,Shivani,18,Oriya


In [50]:
print("Selecting Single column value using dataframe.column name")
series_one = pd.Series(df.Age)
print(series_one)

print("Type of selected one")
print(type(series_one))

Selecting Single column value using dataframe.column name
0    21
1    25
2    23
3    22
4    18
Name: Age, dtype: int64
Type of selected one
<class 'pandas.core.series.Series'>


In [51]:
# using [] method
print("Selecting Single column value using dataframe[column name]")
series_one = pd.Series(df['Age'])
print(series_one)

print("Type of selected one")
print(type(series_one))

Selecting Single column value using dataframe[column name]
0    21
1    25
2    23
3    22
4    18
Name: Age, dtype: int64
Type of selected one
<class 'pandas.core.series.Series'>


In [77]:
data

{'Name': ['Bob', 'Charlie', 'David'],
 'Age': [23, 45, 28],
 'Score': [90, 78, 88]}

In [80]:
filtered_df = df.query("Age > 25 and Weight >200")
filtered_df

Unnamed: 0,Permanent_name,Team,Number,Position,Age,Height,Weight,College,Salary
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
5,Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0,,12000000.0
12,Evan Turner,Boston Celtics,11.0,SG,27.0,6-7,220.0,Ohio State,3425510.0
14,Tyler Zeller,Boston Celtics,44.0,C,26.0,7-0,253.0,North Carolina,2616975.0
...,...,...,...,...,...,...,...,...,...
450,Joe Ingles,Utah Jazz,2.0,SF,28.0,6-8,226.0,,2050000.0
451,Chris Johnson,Utah Jazz,23.0,SF,26.0,6-6,206.0,Dayton,981348.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0


In [53]:
filtered_df = data[data['Age']>30]
print(filtered_df)

                Name                    Team  Number Position   Age Height  \
19      Jarrett Jack           Brooklyn Nets     2.0       PG  32.0    6-3   
31      Lou Amundson         New York Knicks    17.0       PF  33.0    6-9   
33   Carmelo Anthony         New York Knicks     7.0       SF  32.0    6-8   
34     Jose Calderon         New York Knicks     3.0       PG  34.0    6-3   
43     Sasha Vujacic         New York Knicks    18.0       SG  32.0    6-7   
..               ...                     ...     ...      ...   ...    ...   
406  Tayshaun Prince  Minnesota Timberwolves    12.0       SF  36.0    6-9   
413    Nick Collison   Oklahoma City Thunder     4.0       PF  35.0   6-10   
415       Randy Foye   Oklahoma City Thunder     6.0       SG  32.0    6-4   
420    Nazr Mohammed   Oklahoma City Thunder    13.0        C  38.0   6-10   
434      Chris Kaman  Portland Trail Blazers    35.0        C  34.0    7-0   

     Weight           College      Salary  
19    200.0      Ge

In [54]:
filtered_df = data[data['College'] == 'Texas']
display(filtered_df)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
66,Cory Joseph,Toronto Raptors,6.0,PG,24.0,6-3,190.0,Texas,7000000.0
133,P.J. Tucker,Phoenix Suns,17.0,SF,31.0,6-6,245.0,Texas,5500000.0
179,Tristan Thompson,Cleveland Cavaliers,13.0,C,25.0,6-9,238.0,Texas,14260870.0
208,Myles Turner,Indiana Pacers,33.0,PF,20.0,6-11,243.0,Texas,2357760.0
289,Jordan Hamilton,New Orleans Pelicans,25.0,SG,25.0,6-7,220.0,Texas,1015421.0
294,LaMarcus Aldridge,San Antonio Spurs,12.0,PF,30.0,6-11,240.0,Texas,19689000.0
384,D.J. Augustin,Denver Nuggets,12.0,PG,28.0,6-0,183.0,Texas,3000000.0
414,Kevin Durant,Oklahoma City Thunder,35.0,SF,27.0,6-9,240.0,Texas,20158622.0


AttributeError: 'function' object has no attribute 'columns'

In [55]:
filtered_df = data.loc[data['Age']> 30]
filtered_df['Age']

Unnamed: 0,Age
19,32.0
31,33.0
33,32.0
34,34.0
43,32.0
...,...
406,36.0
413,35.0
415,32.0
420,38.0


In [56]:
import pandas as pd
data1 = {'Name': ['Alice', 'Bob', 'Charlie'],
        'Age': [25, 32,45],
        'Score': [85, 90, 78]}
df = pd.DataFrame(data)

# Filter rows where Age is either 25 or 45
filtered_df = df[df['Age'].isin([25, 45])]
print(filtered_df)

                    Name                    Team  Number Position   Age  \
0          Avery Bradley          Boston Celtics     0.0       PG  25.0   
1            Jae Crowder          Boston Celtics    99.0       SF  25.0   
7           Kelly Olynyk          Boston Celtics    41.0        C  25.0   
26       Thomas Robinson           Brooklyn Nets    41.0       PF  25.0   
35      Cleanthony Early         New York Knicks    11.0       SF  25.0   
44      Derrick Williams         New York Knicks    23.0       PF  25.0   
47         Isaiah Canaan      Philadelphia 76ers     0.0       PG  25.0   
48      Robert Covington      Philadelphia 76ers    33.0       SF  25.0   
59       Hollis Thompson      Philadelphia 76ers    31.0       SG  25.0   
71         Terrence Ross         Toronto Raptors    31.0       SF  25.0   
79             Ian Clark   Golden State Warriors    21.0       SG  25.0   
105          C.J. Wilcox    Los Angeles Clippers    30.0       SG  25.0   
113           Ryan Kelly 

In [74]:
data

{'Name': ['Bob', 'Charlie', 'David'],
 'Age': [23, 45, 28],
 'Score': [90, 78, 88]}

In [65]:
filtered_df = df.query('Age > 30' and 'Weight < 30')
print(filtered_df)

Empty DataFrame
Columns: [Name, Team, Number, Position, Age, Height, Weight, College, Salary]
Index: []


In [72]:
import pandas as pd
data2 = {'Name': ['Bob', 'Charlie', 'David'],
        'Age': [23, 45, 28],
        'Score': [90, 78, 88]}
df = pd.DataFrame(data)

# AND operation: Age > 25 AND Score > 80
and_filter = df[(df['Age'] > 25) & (df['Score'] > 80)]
print("AND Operation Result:")
print(and_filter)

# OR operation: Age > 25 OR Score > 80
or_filter = df[(df['Age'] > 25) | (df['Score'] > 80)]
print("\nOR Operation Result:")
print(or_filter)

# NOT operation: NOT (Age > 25)
not_filter = df[~(df['Age'] > 25)]
print("\nNOT Operation Result:")
print(not_filter)

AND Operation Result:
    Name  Age  Score
2  David   28     88

OR Operation Result:
      Name  Age  Score
0      Bob   23     90
1  Charlie   45     78
2    David   28     88

NOT Operation Result:
  Name  Age  Score
0  Bob   23     90


In [97]:
data.columns

Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
       'College', 'Salary'],
      dtype='object')

In [83]:
df

Unnamed: 0,Permanent_name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [98]:
# Given a dataset:
# ● Replace "N/A" and "?" with np.nan.
# ● Drop columns with >50% nulls.
# ● Fill remaining nulls with mean (for numeric) or mode (for categorical).




In [99]:
df

In [102]:
data = pd.read_csv('nba.csv')
data

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [103]:

import numpy as np
data = data.isnull()
print(data[data])


replaced_Nan = data.replace({'NaN',np.nan})
replaced_Nan

     Name  Team Number Position   Age Height Weight College Salary
0     NaN   NaN    NaN      NaN   NaN    NaN    NaN     NaN    NaN
1     NaN   NaN    NaN      NaN   NaN    NaN    NaN     NaN    NaN
2     NaN   NaN    NaN      NaN   NaN    NaN    NaN     NaN   True
3     NaN   NaN    NaN      NaN   NaN    NaN    NaN     NaN    NaN
4     NaN   NaN    NaN      NaN   NaN    NaN    NaN    True    NaN
..    ...   ...    ...      ...   ...    ...    ...     ...    ...
453   NaN   NaN    NaN      NaN   NaN    NaN    NaN     NaN    NaN
454   NaN   NaN    NaN      NaN   NaN    NaN    NaN    True    NaN
455   NaN   NaN    NaN      NaN   NaN    NaN    NaN    True    NaN
456   NaN   NaN    NaN      NaN   NaN    NaN    NaN     NaN    NaN
457  True  True   True     True  True   True   True    True   True

[458 rows x 9 columns]


  replaced_Nan = data.replace({'NaN',np.nan})


Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...
453,False,False,False,False,False,False,False,False,False
454,False,False,False,False,False,False,False,True,False
455,False,False,False,False,False,False,False,True,False
456,False,False,False,False,False,False,False,False,False


In [105]:
!unzip sahil.zip

Archive:  sahil.zip
   creating: Data _Aggregation/
  inflating: Data _Aggregation/anscombe.json  
  inflating: Data _Aggregation/california_housing_test.csv  
  inflating: Data _Aggregation/california_housing_train.csv  
  inflating: Data _Aggregation/file_example_XLS_100 (1).xls  
  inflating: Data _Aggregation/file_example_XLS_100.xls  
  inflating: Data _Aggregation/file_example_XLS_50.xls  
  inflating: Data _Aggregation/mnist_test.csv  
  inflating: Data _Aggregation/mnist_train_small.csv  
  inflating: Data _Aggregation/nba.csv  
  inflating: Data _Aggregation/sample1.json  
  inflating: Data _Aggregation/sample2 (1).json  
  inflating: Data _Aggregation/sample2.json  
  inflating: Data _Aggregation/sample3.json  


In [116]:
import glob
import os

folder_path = '/content/Data _Aggregation'


csv_files = glob.glob(os.path.join(folder_path, '*.csv'))

# Print the list of CSV files found
print(csv_files)

['/content/Data _Aggregation/california_housing_train.csv', '/content/Data _Aggregation/mnist_test.csv', '/content/Data _Aggregation/california_housing_test.csv', '/content/Data _Aggregation/mnist_train_small.csv', '/content/Data _Aggregation/nba.csv']


In [112]:
import pandas as pd
import numpy as np

print(csv_files)

dfs = []

for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    dfs.append(df)

merged_df = pd.concat(dfs, ignore_index=True)

merged_df.to_csv('merged_data.csv', index=False) # index=False prevents writing the DataFrame index as a column


df =pd.read_csv('merged_data.csv')

['/content/Data _Aggregation/california_housing_train.csv', '/content/Data _Aggregation/mnist_test.csv', '/content/Data _Aggregation/california_housing_test.csv', '/content/Data _Aggregation/mnist_train_small.csv', '/content/Data _Aggregation/nba.csv']


  df =pd.read_csv('merged_data.csv')


In [140]:
df2 = pd.read_csv('merged_data.csv')
df2

  df2 = pd.read_csv('merged_data.csv')


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,7,...,29,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0,,...,,,,,,,,,,
1,-114.47,34.40,19.0,7650.0,1901.0,1129.0,463.0,1.8200,80100.0,,...,,,,,,,,,,
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0,,...,,,,,,,,,,
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0,,...,,,,,,,,,,
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.9250,65500.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50451,,,,,,,,,,,...,,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
50452,,,,,,,,,,,...,,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
50453,,,,,,,,,,,...,,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
50454,,,,,,,,,,,...,,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [141]:
df2.shape

(50456, 966)

In [203]:
# Given a dataset:
# ● Replace "N/A" and "?" with np.nan.
# ● Drop columns with >50% nulls.e
# ● Fill remaining nulls with mean (for numeric) or mode (for categorical).


df2
null_counts = df2.isnull().sum()

total_rows = len(df2)
total_rows

null_percentages = (null_counts / total_rows) * 100
null_percentages

columns_with_high_nulls = null_percentages[null_percentages >= 50].index
columns_with_high_nulls

drop_columns = df2.drop(columns = columns_with_high_nulls, axis = 1)
df2 = drop_columns
df2

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,0.581,0.582,0.583,0.584,0.585,0.586,0.587,0.588,0.589,0.590
17000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,179.0,66.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [210]:
# Fill remaining nulls with mean (for numeric) or mode (for categorical).

columns_with_remaining_nulls = null_percentages[null_percentages < 50]
columns_with_remaining_nulls


fill_remaining_nulls = df2.fillna(df2.mean())
fill_remaining_nulls


left_columns = fill_remaining_nulls.isnull().sum()
left_columns

Unnamed: 0,0


Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,0.581,0.582,0.583,0.584,0.585,0.586,0.587,0.588,0.589,0.590
17000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,179.0,66.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [220]:
data_nba = pd.read_csv('nba.csv')
data_nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [228]:
data_nba['Name'] = data_nba['Name'].astype('category')
data_nba['Team'] = data_nba['Team'].astype('category')
data_nba['Position'] = data_nba['Position'].astype('category')


categorical_cols = data_nba.dtypes[data_nba.dtypes == 'category'].index.tolist()
print(f"Categorical Columns: {categorical_cols}")

numerical_cols = data_nba.select_dtypes(include=['number']).columns.tolist()
print(f"Numerical Columns: {numerical_cols}")

Categorical Columns: ['Name', 'Team', 'Position']
Numerical Columns: ['Number', 'Age', 'Weight', 'Salary']


In [137]:
data_values  = {'Name': ['Alice', 'Bob', 'Charlie'],
        'Age': [25, 'N/A',45],
        'Score': [85, '?', 78]}
df1 = pd.DataFrame(data_values)


replace_items = df1.replace(['N/A','?' ],np.nan)
print(data_values)
print(replace_items)



{'Name': ['Alice', 'Bob', 'Charlie'], 'Age': [25, 'N/A', 45], 'Score': [85, '?', 78]}
      Name   Age  Score
0    Alice  25.0   85.0
1      Bob   NaN    NaN
2  Charlie  45.0   78.0


  replace_items = df1.replace(['N/A','?' ],np.nan)
