### Importing necessary Libraries

In [None]:
import os
import zipfile
import pandas as pd
import numpy as np
from tqdm import tqdm
import plotly.express as px

In [2]:
# Load the cv-invalid metadata table directly from the assessment archive.
# Both the archive and the member stream are opened in `with` blocks so their
# file handles are released as soon as the CSV has been parsed.
with zipfile.ZipFile('SHL_Hiring_Asessment (5).zip') as archive:
    with archive.open('cv-invalid.csv') as csv_file:
        df = pd.read_csv(csv_file)
df

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration
0,cv-invalid/sample-000000.mp3,revenge is not my style but obviously accident...,1,10,,,,
1,cv-invalid/sample-000001.mp3,it was bunched up and he had hardly thought of...,0,2,twenties,male,us,
2,cv-invalid/sample-000002.mp3,then suddenly he noticed it with a start,10,4,thirties,female,canada,
3,cv-invalid/sample-000003.mp3,that's the point at which most people give up,0,1,,,,
4,cv-invalid/sample-000004.mp3,you got someplace to sleep,0,1,,,,
...,...,...,...,...,...,...,...,...
25398,cv-invalid/sample-025398.mp3,well then we've got a problem,0,4,,,,
25399,cv-invalid/sample-025399.mp3,the boy was surprised at his thoughts,0,6,,,,
25400,cv-invalid/sample-025400.mp3,undefined,1,2,,,,
25401,cv-invalid/sample-025401.mp3,but there was something there in his heart tha...,1,5,,,,


In [3]:
# Summarise the frame: per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25403 entries, 0 to 25402
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   filename    25403 non-null  object 
 1   text        25403 non-null  object 
 2   up_votes    25403 non-null  int64  
 3   down_votes  25403 non-null  int64  
 4   age         5851 non-null   object 
 5   gender      5868 non-null   object 
 6   accent      5008 non-null   object 
 7   duration    0 non-null      float64
dtypes: float64(1), int64(2), object(5)
memory usage: 1.6+ MB


In [4]:
# Preview the first five rows of the metadata table.
df.head()

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration
0,cv-invalid/sample-000000.mp3,revenge is not my style but obviously accident...,1,10,,,,
1,cv-invalid/sample-000001.mp3,it was bunched up and he had hardly thought of...,0,2,twenties,male,us,
2,cv-invalid/sample-000002.mp3,then suddenly he noticed it with a start,10,4,thirties,female,canada,
3,cv-invalid/sample-000003.mp3,that's the point at which most people give up,0,1,,,,
4,cv-invalid/sample-000004.mp3,you got someplace to sleep,0,1,,,,


In [5]:
# (number of rows, number of columns)
df.shape

(25403, 8)

In [6]:
# Count the missing values in every column.
df.isna().sum()

filename          0
text              0
up_votes          0
down_votes        0
age           19552
gender        19535
accent        20395
duration      25403
dtype: int64

In [8]:
# List the columns that contain at least one missing value.
df.columns[df.isnull().any()].tolist()

['age', 'gender', 'accent', 'duration']

In [9]:
# Drop the columns that are mostly (age, gender, accent) or entirely (duration)
# missing. Reassigning with errors='ignore' instead of dropping in place keeps
# this cell idempotent: re-running it after the columns are already gone no
# longer raises a KeyError.
df = df.drop(columns=['duration', 'age', 'gender', 'accent'], errors='ignore')
print('Cleaned Data:')
print(df.head())

Cleaned Data:
                       filename  \
0  cv-invalid/sample-000000.mp3   
1  cv-invalid/sample-000001.mp3   
2  cv-invalid/sample-000002.mp3   
3  cv-invalid/sample-000003.mp3   
4  cv-invalid/sample-000004.mp3   

                                                text  up_votes  down_votes  
0  revenge is not my style but obviously accident...         1          10  
1  it was bunched up and he had hardly thought of...         0           2  
2           then suddenly he noticed it with a start        10           4  
3      that's the point at which most people give up         0           1  
4                         you got someplace to sleep         0           1  


In [10]:
# Column labels that remain after the cleaning step.
df.columns

Index(['filename', 'text', 'up_votes', 'down_votes'], dtype='object')

### Plotting a scatter plot with a best-fit line

In [7]:
from sklearn.linear_model import LinearRegression

# Fit a one-feature linear regression: down_votes as a function of up_votes.
model = LinearRegression()
model.fit(df[['up_votes']], df['down_votes'])

# Store the fitted value for every row next to the observed votes.
df['y_pred'] = model.predict(df[['up_votes']])

# Scatter of the raw observations.
fig = px.scatter(df, x='up_votes', y='down_votes', opacity=0.45,
                 labels={'up_votes': 'Up Votes', 'down_votes': 'Down Votes'},
                 title='Best fit line for line plot')

# Draw the regression line through the distinct up_votes values in ascending
# order. Feeding every row in file order to a 'lines' trace makes it zig-zag
# back and forth over the same line with one segment per row.
line_points = (
    df[['up_votes', 'y_pred']]
    .drop_duplicates('up_votes')
    .sort_values('up_votes')
)
fig.add_scatter(x=line_points['up_votes'], y=line_points['y_pred'],
                mode='lines', name='Best fit line')
fig.show()

### Feature Extraction with Librosa

In [None]:
import librosa
from joblib import Parallel, delayed
import language_tool_python

# --- Configuration ---
ZIP_PATH = "SHL_Hiring_Asessment (5).zip"  # archive read for both the CSV and the audio members
CSV_IN_ZIP = "cv-invalid.csv"              # metadata table inside the archive
OUTPUT_FILE = "audio_features.csv"         # extracted features are appended here
BATCH_SIZE = 500  # Audio feature batch size

# --- Load and Fix Paths ---
# Read the metadata and the archive's member list in a single pass over the zip.
with zipfile.ZipFile(ZIP_PATH) as archive, archive.open(CSV_IN_ZIP) as csv_stream:
    df = pd.read_csv(csv_stream)
    zip_files = set(archive.namelist())

# Rewrite the path prefix recorded in the CSV to the prefix used for the
# archive lookup below.
df['filename'] = df['filename'].str.replace(
    'cv-invalid/', 'cv-valid-train/cv-valid-train/', regex=False
)

# Keep only the rows whose rewritten path is an actual member of the archive.
in_archive = df['filename'].isin(zip_files)
df = df.loc[in_archive].copy()

n_found = len(df)
print(f"✅ Found {n_found} valid files to process")

# --- Parallel Grammar Score Function ---
def calc_grammar(texts):
    """Score each text by how few LanguageTool matches it triggers per word.

    For every text the score is ``max(0.1, 1 - matches / words)``, where
    ``matches`` is the number of items returned by ``tool.check(text)`` and
    ``words`` is the whitespace-split word count (treated as at least 1).

    Args:
        texts: Iterable of strings to score.

    Returns:
        A list of floats, one per input text, in input order. A text whose
        check raises (for example a non-string value) gets the floor score 0.1.
    """
    texts = list(texts)
    if not texts:
        # Nothing to score: avoid creating a LanguageTool instance at all.
        return []

    tool = language_tool_python.LanguageTool('en-US')
    try:
        results = []
        for text in texts:
            try:
                score = max(0.1, 1 - len(tool.check(text)) / max(1, len(text.split())))
            except Exception:
                # Deliberate best-effort: one bad text must not abort the chunk.
                score = 0.1
            results.append(score)
        return results
    finally:
        # Always release the tool; this function is called once per chunk, so
        # an unclosed instance would otherwise be left behind by every call.
        tool.close()

# --- Apply Grammar Score in Parallel ---
if 'grammar_score' not in df.columns:
    print("Calculating grammar scores in parallel...")

    texts = df['text'].tolist()
    # os.cpu_count() may return None when the count cannot be determined.
    num_jobs = os.cpu_count() or 1
    # Ceiling division, floored at 1. Plain floor division gives 0 when there
    # are fewer texts than workers (range() then raises on a zero step) and
    # otherwise leaves an extra undersized trailing chunk.
    chunk_size = max(1, -(-len(texts) // num_jobs))

    text_chunks = [texts[i:i+chunk_size] for i in range(0, len(texts), chunk_size)]

    # One calc_grammar call per chunk; results come back in chunk order.
    grammar_scores = Parallel(n_jobs=num_jobs)(
        delayed(calc_grammar)(chunk) for chunk in tqdm(text_chunks, desc="Scoring grammar")
    )

    # Flatten the per-chunk lists back into one score per row of df.
    df['grammar_score'] = [score for sublist in grammar_scores for score in sublist]
    df = df.dropna(subset=['grammar_score'])

# --- Audio Feature Extraction ---
def extract_features(file_path_in_zip):
    """Extract mean MFCC and zero-crossing-rate features for one archive member.

    The member is opened from ``ZIP_PATH`` and decoded with ``librosa.load``
    at 22050 Hz using the ``kaiser_fast`` resampler.

    Args:
        file_path_in_zip: Name of the audio member inside the archive.

    Returns:
        A dict with ``filename``, ``mfcc_0_mean`` .. ``mfcc_12_mean`` and
        ``zcr_mean``, or ``None`` if anything fails (the error is printed).
    """
    try:
        with zipfile.ZipFile(ZIP_PATH) as z:
            with z.open(file_path_in_zip) as f:
                y, sr = librosa.load(f, sr=22050, res_type='kaiser_fast')

        # Compute the MFCC matrix once and reuse it for every coefficient,
        # instead of recomputing it for each of the 13 means.
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)

        features = {'filename': file_path_in_zip}
        for i in range(13):
            features[f'mfcc_{i}_mean'] = np.mean(mfcc[i])
        features['zcr_mean'] = np.mean(librosa.feature.zero_crossing_rate(y))
        return features
    except Exception as e:
        # Best-effort: report the failing member and let the caller skip it.
        print(f"❌ Error processing {file_path_in_zip}: {str(e)}")
        return None

# --- Continue if already exists ---
# Filenames already present in the output file are skipped, so an interrupted
# run can be resumed.
processed_files = set()
if os.path.exists(OUTPUT_FILE):
    processed_files = set(pd.read_csv(OUTPUT_FILE)['filename'])
    print(f"🔄 Resuming from {len(processed_files)} processed files")

remaining = [row for _, row in df.iterrows()
             if row['filename'] not in processed_files]

for i in tqdm(range(0, len(remaining), BATCH_SIZE), desc="🎧 Extracting audio features", unit="batch"):
    batch = remaining[i:i + BATCH_SIZE]

    # Parallel returns results in the same order as the submitted rows.
    results = Parallel(n_jobs=-1)(
        delayed(extract_features)(row['filename']) for row in batch
    )

    # Attach each grammar score to the features of its own row. Slicing the
    # batch by the number of successes would shift every score after the
    # first failed file onto the wrong filename.
    successful = [
        {**features, 'grammar_score': row['grammar_score']}
        for row, features in zip(batch, results)
        if features is not None
    ]
    if successful:
        successful_df = pd.DataFrame(successful)
        # Append per batch; write the header only when the file is first created.
        successful_df.to_csv(OUTPUT_FILE, mode='a',
                             header=not os.path.exists(OUTPUT_FILE),
                             index=False)

print("Feature extraction complete!")

✅ Found 25403 valid files to process
Calculating grammar scores in parallel...


Scoring grammar: 100%|██████████| 5/5 [00:00<00:00, 15.21it/s]


In [11]:
# Reload the feature file written by the extraction step as a sanity check.
df_check = pd.read_csv('audio_features.csv')
# Only a cell's last expression is displayed, so a bare `df_check.shape` here
# would be silently discarded; print it explicitly.
print(df_check.shape)
df_check.head()

Unnamed: 0,filename,grammar_score,mfcc_0_mean,mfcc_1_mean,mfcc_2_mean,mfcc_3_mean,mfcc_4_mean,mfcc_5_mean,mfcc_6_mean,mfcc_7_mean,mfcc_8_mean,mfcc_9_mean,mfcc_10_mean,mfcc_11_mean,mfcc_12_mean,zcr_mean
