In [1]:
path = "https://raw.githubusercontent.com/shobhit-nigam/knowledgeclan/main/datasets/cars.csv"

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
dfa = pd.read_csv(path)

In [4]:
dfa.shape

(392, 8)

In [5]:
list(dfa.columns)

['mpg',
 'cylinders',
 'displacement',
 'horsepower',
 'weight',
 'acceleration',
 'year',
 'origin']

In [6]:
dfa.head(4)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1


In [7]:
dfb = dfa.sample(frac=1)

In [8]:
dfc = pd.get_dummies(dfb, columns=["year","cylinders"])

In [9]:
dfc.columns

Index(['mpg', 'displacement', 'horsepower', 'weight', 'acceleration', 'origin',
       'year_70', 'year_71', 'year_72', 'year_73', 'year_74', 'year_75',
       'year_76', 'year_77', 'year_78', 'year_79', 'year_80', 'year_81',
       'year_82', 'cylinders_3', 'cylinders_4', 'cylinders_5', 'cylinders_6',
       'cylinders_8'],
      dtype='object')

In [46]:
392*0.7

274.4

In [10]:
train = dfc.iloc[0:275]

In [12]:
test = dfc.iloc[275:]

In [13]:
from sklearn.linear_model import LogisticRegression

In [14]:
unique_origins = dfc["origin"].unique()
unique_origins.sort()

In [15]:
models = {}
features = [x for x in train.columns if x.startswith("cyl") or x.startswith("year")]

for origin in unique_origins:
    model = LogisticRegression()
    
    X_train = train[features]
    y_train = train["origin"] == origin
    
    model.fit(X_train, y_train)
    models[origin] = model

In [16]:
testing_probs = pd.DataFrame(columns=unique_origins)

for origin in unique_origins:
    
    X_test = test[features]
    
    testing_probs[origin] = models[origin].predict_proba(X_test)[:, 1]

In [17]:
testing_probs

Unnamed: 0,1,2,3
0,0.875131,0.038695,0.087845
1,0.975196,0.008929,0.035589
2,0.459905,0.193428,0.326725
3,0.865341,0.053574,0.069196
4,0.240575,0.313030,0.453122
...,...,...,...
112,0.364838,0.376336,0.245180
113,0.459905,0.193428,0.326725
114,0.895417,0.026387,0.100775
115,0.369426,0.157330,0.461486


In [18]:
predicted_origin = testing_probs.idxmax(axis=1)

predicted_origin

0      1
1      1
2      1
3      1
4      3
      ..
112    2
113    1
114    1
115    3
116    1
Length: 117, dtype: int64

In [20]:
unique_origins

array([1, 2, 3])

In [19]:
test.columns

Index(['mpg', 'displacement', 'horsepower', 'weight', 'acceleration', 'origin',
       'year_70', 'year_71', 'year_72', 'year_73', 'year_74', 'year_75',
       'year_76', 'year_77', 'year_78', 'year_79', 'year_80', 'year_81',
       'year_82', 'cylinders_3', 'cylinders_4', 'cylinders_5', 'cylinders_6',
       'cylinders_8'],
      dtype='object')

In [24]:
test["predicted_origin"] = list(predicted_origin)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [25]:
test

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,year_70,year_71,year_72,year_73,...,year_79,year_80,year_81,year_82,cylinders_3,cylinders_4,cylinders_5,cylinders_6,cylinders_8,predicted_origin
35,19.0,250.0,88.0,3302.0,15.5,1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,1
74,14.0,318.0,150.0,4077.0,14.0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,1
214,31.5,98.0,68.0,2045.0,18.5,3,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
192,22.5,232.0,90.0,3085.0,17.6,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
316,29.8,134.0,90.0,2711.0,15.5,3,0,0,0,0,...,0,1,0,0,0,1,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,19.0,120.0,88.0,3270.0,21.9,2,0,0,0,0,...,0,0,0,0,0,1,0,0,0,2
231,29.0,97.0,78.0,1940.0,14.5,2,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
257,20.8,200.0,85.0,3070.0,16.7,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
345,34.1,91.0,68.0,1985.0,16.0,3,0,0,0,0,...,0,0,1,0,0,1,0,0,0,3


In [27]:
len(test[test['origin'] == test['predicted_origin']])

78

In [28]:
78/117

0.6666666666666666

In [29]:
dfa.corr()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin
mpg,1.0,-0.777618,-0.805127,-0.778427,-0.832244,0.423329,0.580541,0.565209
cylinders,-0.777618,1.0,0.950823,0.842983,0.897527,-0.504683,-0.345647,-0.568932
displacement,-0.805127,0.950823,1.0,0.897257,0.932994,-0.5438,-0.369855,-0.614535
horsepower,-0.778427,0.842983,0.897257,1.0,0.864538,-0.689196,-0.416361,-0.455171
weight,-0.832244,0.897527,0.932994,0.864538,1.0,-0.416839,-0.30912,-0.585005
acceleration,0.423329,-0.504683,-0.5438,-0.689196,-0.416839,1.0,0.290316,0.212746
year,0.580541,-0.345647,-0.369855,-0.416361,-0.30912,0.290316,1.0,0.181528
origin,0.565209,-0.568932,-0.614535,-0.455171,-0.585005,0.212746,0.181528,1.0


In [30]:
# screen size 75
# dimensions 

# LIFE 