In [49]:
# Assignment: NumPy | Pandas
# Numpy

In [50]:
import numpy as np
import pandas as pd

# Load the dataset from the working directory and preview its first rows
DATASET_PATH = 'auto-mpg.csv'
df = pd.read_csv(DATASET_PATH)
preview_rows = df.head(n=5)
print(preview_rows)

    mpg  cylinders  displacement horsepower  weight  acceleration  model year  \
0  18.0          8         307.0        130    3504          12.0          70   
1  15.0          8         350.0        165    3693          11.5          70   
2  18.0          8         318.0        150    3436          11.0          70   
3  16.0          8         304.0        150    3433          12.0          70   
4  17.0          8         302.0        140    3449          10.5          70   

   origin                   car name  
0       1  chevrolet chevelle malibu  
1       1          buick skylark 320  
2       1         plymouth satellite  
3       1              amc rebel sst  
4       1                ford torino  


In [51]:
# 1. Basic Array Operations
# Copy the mpg column into a NumPy array and summarise it.
mpg_array = np.array(df['mpg'])
mean_mpg = mpg_array.mean()
median_mpg = np.median(mpg_array)
# ndarray.std() uses ddof=0 by default, i.e. the population standard deviation
std_mpg = mpg_array.std()
# Summing a boolean mask counts its True entries
above_25_mask = mpg_array > 25
cars_above_25 = above_25_mask.sum()

print(f"Mean: {mean_mpg:.2f}")
print(f"Median: {median_mpg:.2f}")
print(f"Standard Deviation: {std_mpg:.2f}")
print(f"Cars with MPG > 25: {cars_above_25}")

Mean: 23.51
Median: 23.00
Standard Deviation: 7.81
Cars with MPG > 25: 158


In [52]:
# 2. Filtering
# Select the names of cars whose cylinder count is strictly greater than 6.
cylinders_array = np.array(df['cylinders'])
car_names = np.array(df['car name'])
more_than_6_mask = cylinders_array > 6
cars_more_than_6_cylinders = car_names[more_than_6_mask]
# Only the first ten matches are printed to keep the output short
print("Cars with more than 6 cylinders:", cars_more_than_6_cylinders[:10].tolist())

Cars with more than 6 cylinders: ['chevrolet chevelle malibu', 'buick skylark 320', 'plymouth satellite', 'amc rebel sst', 'ford torino', 'ford galaxie 500', 'chevrolet impala', 'plymouth fury iii', 'pontiac catalina', 'amc ambassador dpl']


In [53]:
# 3. Statistical Analysis
# Quartiles of vehicle weight, one np.percentile call per requested rank.
weight_array = np.array(df['weight'])
percentile_25, percentile_50, percentile_75 = (
    np.percentile(weight_array, rank) for rank in (25, 50, 75)
)
print(f"25th percentile: {percentile_25:.2f}")
print(f"50th percentile: {percentile_50:.2f}")
print(f"75th percentile: {percentile_75:.2f}")

25th percentile: 2223.75
50th percentile: 2803.50
75th percentile: 3608.00


In [54]:
# 4. Array Manipulation
# Min-max scaling: the smallest acceleration maps to 0 and the largest to 1.
acceleration_array = np.array(df['acceleration'])
acceleration_min = np.min(acceleration_array)
acceleration_span = np.max(acceleration_array) - acceleration_min
normalized_acceleration = (acceleration_array - acceleration_min) / acceleration_span
print("Normalized acceleration (first 10):", normalized_acceleration[:10])

Normalized acceleration (first 10): [0.23809524 0.20833333 0.17857143 0.23809524 0.14880952 0.11904762
 0.05952381 0.0297619  0.11904762 0.0297619 ]


In [55]:
# 5. Broadcasting - Handle missing horsepower values (shown as '?')
# Raw column values, kept under this name for any later cell that reads it
horsepower_array = np.array(df['horsepower'])
# Coerce every entry that cannot be parsed as a number (the dataset marks
# missing values with '?') to NaN. Unlike matching the '?' token by hand and
# casting with astype(float), this does not raise on other non-numeric tokens
# and it also accepts a column that is already numeric. The Pandas section
# converts the same column the same way.
horsepower_numeric = pd.to_numeric(df['horsepower'], errors='coerce').to_numpy(dtype=float)
# Impute the gaps with the mean of the known values (NaN entries are ignored)
mean_horsepower = np.nanmean(horsepower_numeric)
horsepower_filled = np.nan_to_num(horsepower_numeric, nan=mean_horsepower)
# Broadcasting: the scalar 1.10 is applied to every element of the array
horsepower_increased = horsepower_filled * 1.10
print("Horsepower increased by 10% (first 10):", horsepower_increased[:10])

Horsepower increased by 10% (first 10): [143.  181.5 165.  165.  154.  217.8 242.  236.5 247.5 209. ]


In [56]:
# 6. Boolean Indexing
# Average engine displacement over the rows whose origin code equals 2
# (the code this notebook reports as European).
origin_array = np.array(df['origin'])
displacement_array = np.array(df['displacement'])
european_mask = origin_array == 2
european_cars_displacement = displacement_array[european_mask]
avg_displacement_europe = european_cars_displacement.mean()
print(f"Average displacement for European cars: {avg_displacement_europe:.2f}")

Average displacement for European cars: 109.14


In [57]:
# 7. Matrix Operations
# Build a matrix with one row per car and three columns (mpg, filled horsepower,
# weight), then take a weighted sum of each row.
feature_columns = [mpg_array, horsepower_filled, weight_array]
matrix = np.column_stack(feature_columns)
# One weight per column, in the same order as feature_columns
vector = np.array([1, 0.5, -0.2])
dot_product = matrix.dot(vector)
print("Dot product result (first 10):", dot_product[:10])

Dot product result (first 10): [-617.8 -641.1 -594.2 -595.6 -602.8 -754.2 -746.8 -740.9 -758.5 -660. ]


In [58]:
# 8. Sorting
# Order car names from newest to oldest model year.
model_year_array = np.array(df['model year'])
ascending_indices = model_year_array.argsort()
sorted_indices = np.flip(ascending_indices)  # Descending order
sorted_car_names = car_names[sorted_indices]
print("Top 5 cars by model year (descending):", sorted_car_names[:5])

Top 5 cars by model year (descending): ['dodge aries se' 'pontiac phoenix' 'pontiac j2000 se hatchback'
 'chevrolet cavalier 2-door' 'chevrolet cavalier wagon']


In [59]:
# 9. Correlation
# np.corrcoef returns a 2x2 matrix for two inputs; the off-diagonal entry
# [0, 1] is the coefficient between mpg and weight.
correlation_matrix = np.corrcoef(mpg_array, weight_array)
correlation = correlation_matrix[0, 1]
print(f"Correlation between MPG and weight: {correlation:.2f}")

Correlation between MPG and weight: -0.83


In [60]:
# 10. Conditional Aggregates
# Mean mpg for each distinct cylinder count (np.unique returns them sorted).
cylinders_unique = np.unique(cylinders_array)
print("Mean MPG by cylinders:")
for cyl in cylinders_unique:
    same_cylinder_mask = cylinders_array == cyl
    mpg_for_cyl = mpg_array[same_cylinder_mask]
    mean_mpg_cyl = mpg_for_cyl.mean()
    print(f"  {cyl} cylinders: {mean_mpg_cyl:.2f}")

Mean MPG by cylinders:
  3 cylinders: 20.55
  4 cylinders: 29.29
  5 cylinders: 27.37
  6 cylinders: 19.99
  8 cylinders: 14.96


In [61]:
# Pandas

In [62]:
# 1. Basic Exploration
# Show the first ten rows, the (rows, columns) shape, and describe() statistics.
first_ten_rows = df.head(10)
print("First 10 rows:")
print(first_ten_rows)
print(f"\nDataset shape: {df.shape}")
summary_statistics = df.describe()
print("\nSummary statistics:")
print(summary_statistics)

First 10 rows:
    mpg  cylinders  displacement horsepower  weight  acceleration  model year  \
0  18.0          8         307.0        130    3504          12.0          70   
1  15.0          8         350.0        165    3693          11.5          70   
2  18.0          8         318.0        150    3436          11.0          70   
3  16.0          8         304.0        150    3433          12.0          70   
4  17.0          8         302.0        140    3449          10.5          70   
5  15.0          8         429.0        198    4341          10.0          70   
6  14.0          8         454.0        220    4354           9.0          70   
7  14.0          8         440.0        215    4312           8.5          70   
8  14.0          8         455.0        225    4425          10.0          70   
9  15.0          8         390.0        190    3850           8.5          70   

   origin                   car name  
0       1  chevrolet chevelle malibu  
1       1      

In [63]:
# 2. Filtering and Indexing
# Keep rows with model year 75 and weight under 3000, then narrow to three columns.
is_model_year_75 = df['model year'].eq(75)
is_under_3000 = df['weight'].lt(3000)
filtered_cars = df[is_model_year_75 & is_under_3000]
columns_to_show = ['car name', 'weight', 'mpg']
selected_columns = filtered_cars[columns_to_show]
print("Cars from 1975 with weight < 3000:")
print(selected_columns)

Cars from 1975 with weight < 3000:
              car name  weight   mpg
167     toyota corolla    2171  29.0
168         ford pinto    2639  23.0
169        amc gremlin    2914  20.0
170      pontiac astro    2592  23.0
171      toyota corona    2702  24.0
172  volkswagen dasher    2223  25.0
173         datsun 710    2545  24.0
174         ford pinto    2984  18.0
175  volkswagen rabbit    1937  29.0
177         audi 100ls    2694  23.0
178        peugeot 504    2957  23.0
179        volvo 244dl    2945  22.0
180          saab 99le    2671  25.0
181   honda civic cvcc    1795  33.0


In [64]:
# 3. Handling Missing Data
# Per-column NaN tally. This runs before the '?' placeholders in horsepower are
# converted, so those entries are not counted here.
print("Missing values in each column:")
print(df.isna().sum())

# Make horsepower numeric; entries that cannot be parsed (such as '?') become NaN
horsepower_as_number = pd.to_numeric(df['horsepower'], errors='coerce')
df['horsepower'] = horsepower_as_number
print(f"\nMissing horsepower values after conversion: {df['horsepower'].isnull().sum()}")

# Fill the gaps with the column median, only when there is something to fill
if df['horsepower'].isnull().any():
    median_horsepower = df['horsepower'].median()
    df['horsepower'] = df['horsepower'].fillna(median_horsepower)
    print(f"Replaced missing horsepower with median: {median_horsepower}")

Missing values in each column:
mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

Missing horsepower values after conversion: 6
Replaced missing horsepower with median: 93.5


In [65]:
# 4. Data Transformation
# Add a derived column: horsepower divided by weight, row by row.
# The division needs horsepower to be numeric, which the missing-data cell
# above takes care of.
df['power_to_weight_ratio'] = df['horsepower'].div(df['weight'])
ratio_preview_columns = ['car name', 'horsepower', 'weight', 'power_to_weight_ratio']
print("DataFrame with new power_to_weight_ratio column:")
print(df[ratio_preview_columns].head())

DataFrame with new power_to_weight_ratio column:
                    car name  horsepower  weight  power_to_weight_ratio
0  chevrolet chevelle malibu       130.0    3504               0.037100
1          buick skylark 320       165.0    3693               0.044679
2         plymouth satellite       150.0    3436               0.043655
3              amc rebel sst       150.0    3433               0.043694
4                ford torino       140.0    3449               0.040591


In [66]:
# 5. Group By
# Mean mpg within each origin code.
mpg_grouped_by_origin = df.groupby('origin')['mpg']
mpg_by_origin = mpg_grouped_by_origin.mean()
print("Average MPG by origin:")
print(mpg_by_origin)

Average MPG by origin:
origin
1    20.083534
2    27.891429
3    30.450633
Name: mpg, dtype: float64


In [67]:
# 6. Sorting
# Rank all rows by mpg, highest first, and keep the leading ten.
ranked_by_mpg = df.sort_values(by='mpg', ascending=False)
top_mpg_cars = ranked_by_mpg.iloc[:10]
print("Top 10 cars by MPG:")
print(top_mpg_cars[['car name', 'mpg']])

Top 10 cars by MPG:
                            car name   mpg
322                        mazda glc  46.6
329              honda civic 1500 gl  44.6
325             vw rabbit c (diesel)  44.3
394                        vw pickup  44.0
326               vw dasher (diesel)  43.4
244  volkswagen rabbit custom diesel  43.1
309                        vw rabbit  41.5
330             renault lecar deluxe  40.9
324                       datsun 210  40.8
247                   datsun b210 gx  39.4


In [68]:
# 7. Apply Function
def performance_score(row):
    """Return mpg times acceleration, divided by weight, for one record.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys 'mpg', 'acceleration' and 'weight',
        such as a DataFrame row passed in by ``DataFrame.apply``.

    Returns
    -------
    The value ``row['mpg'] * row['acceleration'] / row['weight']``.
    """
    mpg_times_acceleration = row['mpg'] * row['acceleration']
    return mpg_times_acceleration / row['weight']

# Call performance_score once per row (axis='columns' is the same as axis=1)
df['performance_score'] = df.apply(performance_score, axis='columns')
score_preview_columns = ['car name', 'mpg', 'acceleration', 'weight', 'performance_score']
print("DataFrame with performance_score:")
print(df[score_preview_columns].head())

DataFrame with performance_score:
                    car name   mpg  acceleration  weight  performance_score
0  chevrolet chevelle malibu  18.0          12.0    3504           0.061644
1          buick skylark 320  15.0          11.5    3693           0.046710
2         plymouth satellite  18.0          11.0    3436           0.057625
3              amc rebel sst  16.0          12.0    3433           0.055928
4                ford torino  17.0          10.5    3449           0.051754


In [69]:
# 8. Visualization Preparation
# One row per model year holding the mean of mpg, weight and horsepower.
# reset_index() turns the group key back into an ordinary column.
mean_per_column = dict.fromkeys(['mpg', 'weight', 'horsepower'], 'mean')
grouped_by_year = df.groupby('model year')
yearly_summary = grouped_by_year.agg(mean_per_column).reset_index()
print("Yearly summary:")
print(yearly_summary)

Yearly summary:
    model year        mpg       weight  horsepower
0           70  17.689655  3372.793103  147.827586
1           71  21.250000  2995.428571  106.553571
2           72  18.714286  3237.714286  120.178571
3           73  17.100000  3419.025000  130.475000
4           74  22.703704  2877.925926   94.203704
5           75  20.266667  3176.800000  101.066667
6           76  21.573529  3078.735294  101.117647
7           77  23.375000  2997.357143  105.071429
8           78  24.061111  2861.805556   99.694444
9           79  25.093103  3055.344828  101.206897
10          80  33.696552  2436.655172   78.586207
11          81  30.334483  2522.931034   81.465517
12          82  31.709677  2453.548387   81.854839


In [70]:
# 9. Exporting Data
# Write the rows with mpg above 30, limited to four columns, to a CSV file
# without the index column.
export_columns = ['mpg', 'cylinders', 'horsepower', 'weight']
is_high_mpg = df['mpg'] > 30
high_mpg_cars = df.loc[is_high_mpg, export_columns]
high_mpg_cars.to_csv('high_mpg_cars.csv', index=False)
print("High MPG cars saved to high_mpg_cars.csv")

High MPG cars saved to high_mpg_cars.csv


In [71]:
# 10. Finding Anomalies
# IQR rule: a value is an outlier when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile.
mpg_values = df['mpg']
Q1 = mpg_values.quantile(0.25)
Q3 = mpg_values.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

below_lower = mpg_values < lower_bound
above_upper = mpg_values > upper_bound
mpg_outliers = df[below_lower | above_upper]
outlier_cars = mpg_outliers[['car name', 'mpg', 'model year']]
print("MPG Outliers (IQR method):")
print(f"Lower bound: {lower_bound:.2f}, Upper bound: {upper_bound:.2f}")
print("Outlier cars:")
print(outlier_cars)

MPG Outliers (IQR method):
Lower bound: 0.25, Upper bound: 46.25
Outlier cars:
      car name   mpg  model year
322  mazda glc  46.6          80
