
# TEJ SAI PRANAV REDDY KAGITALA - TXK220023


# General accidents

In [1]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

# Load the dataset into a Pandas dataframe
df = pd.read_csv('general_accidents.csv')

# 1. Initial data exploration and analysis
print(df.describe()) # Basic statistics summary for all columns
print(df.isnull().sum()) # Check for missing values
print(df['Severity'].value_counts()) # Count of different values in the Severity column

# 2. Create the PCA model, separate train and test data (30% Train, 70% Test)
X = df.drop(['Severity', 'StartTime', 'EndTime', 'Street', 'City', 'State', 'Zipcode', 'WeatherCondition'], axis=1)
y = df['Severity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42)

# 3. Train the model
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train)

# 4. Find the ideal number of components to be used so you can have a C.E.V. >= .85
n_components = 1
cev = 0.0
while cev < 0.85:
    pca = PCA(n_components=n_components)
    pca.fit(X_train)
    cev = sum(pca.explained_variance_ratio_)
    n_components += 1

print(f"Ideal number of components with CEV >= 0.85: {n_components-1}")

# 5. Use the test data and calculate the accuracy for 3 different number of components, showing improvement on the results as you approach a higher C.E.V.
for n in [1, 5, 10]:
    pca = PCA(n_components=n)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    # Train a simple logistic regression model
    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression(random_state=42).fit(X_train_pca, y_train)

    y_pred = clf.predict(X_test_pca)
    print(f"Accuracy score with {n} components: {accuracy_score(y_test, y_pred)}")

            Severity       StartLat       StartLng       Distance  \
count  100000.000000  100000.000000  100000.000000  100000.000000   
mean        2.338270      36.514226     -95.918245       0.288088   
std         0.551132       4.895905      17.357249       1.653184   
min         1.000000      24.569300    -124.474380       0.000000   
25%         2.000000      33.622412    -117.488407       0.000000   
50%         2.000000      35.840891     -91.074997       0.000000   
75%         3.000000      40.300162     -80.995310       0.010000   
max         4.000000      48.964230     -68.368760     176.279999   

             Zipcode    Temperature       Humidity       Pressure  \
count  100000.000000  100000.000000  100000.000000  100000.000000   
mean    57969.864520      61.727966      64.828860      29.709953   
std     30456.072213      18.970448      23.154256       1.324756   
min      1001.000000     -27.900000       0.000000       0.000000   
25%     29704.000000      50.0000

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy score with 5 components: 0.6789428571428572
Accuracy score with 10 components: 0.6801571428571429


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Movie ratings dataset

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

# Load the dataset into a Pandas dataframe
df = pd.read_csv('movies_ratings.csv')

# 1. Initial data exploration and analysis
print(df.describe()) # Basic statistics summary for all columns
print(df.isnull().sum()) # Check for missing values
print(df['Rating'].value_counts()) # Count of different values in the rating column

# 2. Create the PCA model, separate train and test data (30% Train, 70% Test)
X = df.drop(['Title', 'Rating', 'MovieID', 'UserID', 'Age', 'Gender', 'Occupation', 'Zipcode'], axis=1)
y = df['Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42)

# 3. Train the model
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train)

# 4. Find the ideal number of components to be used so you can have a C.E.V. >= .85
n_components = 1
cev = 0.0
while cev < 0.85:
    pca = PCA(n_components=n_components)
    pca.fit(X_train)
    cev = sum(pca.explained_variance_ratio_)
    n_components += 1

print(f"Ideal number of components with CEV >= 0.85: {n_components-1}")

# 5. Use the test data and calculate the accuracy for 3 different number of components, showing improvement on the results as you approach a higher C.E.V.
for n in [1, 5, 10]:
    pca = PCA(n_components=n)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    # Train a simple logistic regression model
    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression(random_state=42).fit(X_train_pca, y_train)

    y_pred = clf.predict(X_test_pca)
    print(f"Accuracy score with {n} components: {accuracy_score(y_test, y_pred)}")

             Action     Adventure     Animation      Children        Comedy  \
count  1.000209e+06  1.000209e+06  1.000209e+06  1.000209e+06  1.000209e+06   
mean   2.574032e-01  1.339250e-01  4.328395e-02  7.217092e-02  3.565055e-01   
std    4.372036e-01  3.405719e-01  2.034957e-01  2.587708e-01  4.789672e-01   
min    0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
25%    0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
50%    0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
75%    1.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  1.000000e+00   
max    1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00   

              Crime   Documentary         Drama       Fantasy     Film-Noir  \
count  1.000209e+06  1.000209e+06  1.000209e+06  1.000209e+06  1.000209e+06   
mean   7.952438e-02  7.908347e-03  3.544549e-01  3.629341e-02  1.825718e-02   
std    2.705556e-01  8.857659e-02  4.783481e-01  1.

KeyError: "['Title', 'MovieID', 'UserID'] not found in axis"

# music popularity

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# 1. Perform the initial data exploration and analysis
df = pd.read_csv('music_popularity.csv')

# Check for missing values
print(df.isnull().sum())

# Check the data types of columns
print(df.dtypes)

# Summary statistics of the dataset
print(df.describe())

# Correlation matrix of the dataset
print(df.corr())

# 2. Create the PCA model, separate train and test data (30% Train, 70% Test)
X = df.drop(['track_id', 'track_name', 'track_artist', 'track_album_id', 'track_album_name', 'track_album_release_date', 'playlist_name', 'playlist_id', 'playlist_genre', 'playlist_subgenre', 'track_popularity'], axis=1)
y = df['track_popularity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# 3. Train the model
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)
lr = LinearRegression()
lr.fit(X_train_pca, y_train)

# 4. Find the ideal number of components to be used so you can have a C.E.V. >= .85
n_components = X_train.shape[1]
for i in range(1, n_components+1):
    pca = PCA(n_components=i)
    X_train_pca = pca.fit_transform(X_train_std)
    cev = np.sum(pca.explained_variance_ratio_)
    if cev >= 0.85:
        print("Ideal number of components:", i)
        print("Cumulative explained variance ratio:", cev)
        break

# 5. Use the test data and calculate the accuracy for 3 different number of components, showing improvement on the results as you approach a higher C.E.V.
n_components_list = [2, 4, 6]
for n in n_components_list:
    pca = PCA(n_components=n)
    X_train_pca = pca.fit_transform(X_train_std)
    X_test_pca = pca.transform(X_test_std)
    lr = LinearRegression()
    lr.fit(X_train_pca, y_train)
    y_pred = lr.predict(X_test_pca)
    r2 = r2_score(y_test, y_pred)
    print("Number of components:", n)
    print("R-squared score:", r2)

track_id                    0
track_name                  5
track_artist                5
track_popularity            0
track_album_id              0
track_album_name            5
track_album_release_date    0
playlist_name               0
playlist_id                 0
playlist_genre              0
playlist_subgenre           0
danceability                0
energy                      0
key                         0
loudness                    0
mode                        0
speechiness                 0
acousticness                0
instrumentalness            0
liveness                    0
valence                     0
tempo                       0
duration_ms                 0
dtype: int64
track_id                     object
track_name                   object
track_artist                 object
track_popularity              int64
track_album_id               object
track_album_name             object
track_album_release_date     object
playlist_name                object
playlist_

# price of autos dataset

In [5]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# load data
df = pd.read_csv('price_of_autos.csv')

# 1. Perform the initial data exploration and analysis
print(df.head())
print(df.describe())

# 2. Create the PCA model, separate train and test data (30% Train, 70% Test)
X = df.drop(['price'], axis=1)
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 3. Train the model
pca = PCA()
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
lr = LinearRegression()
lr.fit(X_train_pca, y_train)

# 4. Find the ideal number of components to be used so you can have a C.E.V. >= .85
cev = np.cumsum(pca.explained_variance_ratio_)
n_components = np.argmax(cev >= 0.85) + 1
print(f"Ideal number of components: {n_components}")

# 5. Use the test data and calculate the accuracy for 3 different number of components,
#    showing improvement on the results as you approach a higher C.E.V.
for n in [1, 5, n_components]:
    pca = PCA(n_components=n)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    lr = LinearRegression()
    lr.fit(X_train_pca, y_train)
    y_pred = lr.predict(X_test_pca)
    r2 = r2_score(y_test, y_pred)
    print(f"Number of components: {n}, R^2 score: {r2}")


   symboling         make fuel_type aspiration  num_of_doors   body_style  \
0          3  alfa-romero       gas        std             2  convertible   
1          3  alfa-romero       gas        std             2  convertible   
2          1  alfa-romero       gas        std             2    hatchback   
3          2         audi       gas        std             4        sedan   
4          2         audi       gas        std             4        sedan   

  drive_wheels engine_location  wheel_base  length  ...  engine_size  \
0          rwd           front        88.6   168.8  ...          130   
1          rwd           front        88.6   168.8  ...          130   
2          rwd           front        94.5   171.2  ...          152   
3          fwd           front        99.8   176.6  ...          109   
4          4wd           front        99.4   176.6  ...          136   

   fuel_system  bore stroke  compression_ratio  horsepower peak_rpm  city_mpg  \
0         mpfi  3.47   

ValueError: could not convert string to float: 'bmw'