In [1]:
import math

class Vector:
    def __init__(self, components):
        self.components = components
        
    def __str__(self):
        return f"Vector({self.components})"
    
    def length(self):
        return math.sqrt(sum(x**2 for x in self.components))
    
    @staticmethod
    def dot_product(v1, v2):
        return sum(x * y for x, y in zip(v1.components, v2.components))
    
    @staticmethod
    def cosine_similarity(v1, v2):
        return Vector.dot_product(v1, v2) / (v1.length() * v2.length())
    
    @staticmethod
    def euclidean_distance(v1, v2):
        return math.sqrt(sum((x - y)**2 for x, y in zip(v1.components, v2.components)))
    
    # Additional method to validate if the other vector is compatible for operations
    @staticmethod
    def validate(v1, v2):
        if len(v1.components) != len(v2.components):
            raise ValueError("Vectors must be of the same dimension.")

In [2]:
v1 = Vector([1, 2, 3])
v2 = Vector([4, 5, 6])

print(f"Length of v1: {v1.length()}")
print(f"Length of v2: {v2.length()}")

try:
    Vector.validate(v1, v2)
    print(f"Cosine Similarity between v1 and v2: {Vector.cosine_similarity(v1, v2)}")
    print(f"Euclidean Distance between v1 and v2: {Vector.euclidean_distance(v1, v2)}")
except ValueError as e:
    print(e)

Length of v1: 3.7416573867739413
Length of v2: 8.774964387392123
Cosine Similarity between v1 and v2: 0.9746318461970762
Euclidean Distance between v1 and v2: 5.196152422706632


In [13]:
"""
Question number 6:

Download the dataset and solve questions that follow. Follow any tutorial on ‘python data frame’.
https://github.com/fivethirtyeight/data/blob/master/us-weather-history/KCLT.csv

Load the csv file of the dataset into a dataframe in Python program.

(i) Find the mean, median, mode, min and max for all numeric attributes.
(ii) Print the top 20% of rows showing only the first four columns.
(iii) Create a new column (call it ‘newColumn’), it should have the same values as the column ‘actual_mean_temp’. Print head of dataframe.
(iv) Remove the new column that you have created above. Print head of dataframe.
(v) Print the first 10 rows, then remove the row containing data of ‘2014-7-3’, save this row in a variable of type series (data structure). Print the first 10 rows after removal of the row. 
(vi) Add the row that you deleted before. Print the first 10 rows again.
(vii) Update the actual_min_temp in the data row for date ‘2014-7-3’ to any value. Print the updated row.
(viii) Add 5 to the ‘actual_mean_temp’ wherever the ‘actual_min_temp’ is odd. Do this only on the top 10 rows. Print these 10 rows before and after the operation.
(ix) Print only those rows where the absolute difference between the ‘record_min_temp_year’ and ‘record_max_temp_year’ is less than 30.

"""

'\nQuestion number 6:\n'

In [6]:
import pandas as pd

df = pd.read_csv('./dataset.csv')

In [7]:
# Find the mean, median, mode, min and max for all numeric attributes.
mean_values = df.mean()
median_values = df.median()
mode_values = df.mode().iloc[0] 
min_values = df.min()
max_values = df.max()

  mean_values = df.mean()
  median_values = df.median()


In [8]:
print(mean_values)

actual_mean_temp           61.049315
actual_min_temp            49.958904
actual_max_temp            71.630137
average_min_temp           48.819178
average_max_temp           70.983562
record_min_temp            31.465753
record_max_temp            88.728767
record_min_temp_year     1953.279452
record_max_temp_year     1953.989041
actual_precipitation        0.102411
average_precipitation       0.114082
record_precipitation        2.208904
dtype: float64


In [9]:
print(median_values)

actual_mean_temp           63.00
actual_min_temp            52.00
actual_max_temp            73.00
average_min_temp           48.00
average_max_temp           72.00
record_min_temp            30.00
record_max_temp            90.00
record_min_temp_year     1963.00
record_max_temp_year     1953.00
actual_precipitation        0.00
average_precipitation       0.11
record_precipitation        1.98
dtype: float64


In [10]:
print(mode_values)

date                     2014-10-1
actual_mean_temp              78.0
actual_min_temp               67.0
actual_max_temp               84.0
average_min_temp              68.0
average_max_temp              89.0
record_min_temp               53.0
record_max_temp              100.0
record_min_temp_year        1967.0
record_max_temp_year        1954.0
actual_precipitation           0.0
average_precipitation         0.11
record_precipitation          1.65
Name: 0, dtype: object


In [11]:
print(min_values)

date                     2014-10-1
actual_mean_temp                18
actual_min_temp                  7
actual_max_temp                 26
average_min_temp                29
average_max_temp                50
record_min_temp                 -5
record_max_temp                 69
record_min_temp_year          1879
record_max_temp_year          1879
actual_precipitation           0.0
average_precipitation         0.09
record_precipitation          0.85
dtype: object


In [12]:
print(max_values)

date                     2015-6-9
actual_mean_temp               88
actual_min_temp                75
actual_max_temp               100
average_min_temp               68
average_max_temp               89
record_min_temp                62
record_max_temp               104
record_min_temp_year         2015
record_max_temp_year         2015
actual_precipitation         2.65
average_precipitation        0.15
record_precipitation         6.88
dtype: object


In [14]:
# Calculate the number of rows to display as top 20%
num_rows = int(len(df) * 0.2)

# Print the top 20% of rows showing only the first four columns
print(df.iloc[:num_rows, :4])


         date  actual_mean_temp  actual_min_temp  actual_max_temp
0    2014-7-1                81               70               91
1    2014-7-2                85               74               95
2    2014-7-3                82               71               93
3    2014-7-4                75               64               86
4    2014-7-5                72               60               84
..        ...               ...              ...              ...
68   2014-9-7                79               70               88
69   2014-9-8                70               67               73
70   2014-9-9                72               66               77
71  2014-9-10                72               65               79
72  2014-9-11                77               64               89

[73 rows x 4 columns]


In [15]:
# Add a new column with the same values as 'actual_mean_temp'
df['newColumn'] = df['actual_mean_temp']

# Print the head of the dataframe
print(df.head())


       date  actual_mean_temp  actual_min_temp  actual_max_temp  \
0  2014-7-1                81               70               91   
1  2014-7-2                85               74               95   
2  2014-7-3                82               71               93   
3  2014-7-4                75               64               86   
4  2014-7-5                72               60               84   

   average_min_temp  average_max_temp  record_min_temp  record_max_temp  \
0                67                89               56              104   
1                68                89               56              101   
2                68                89               56               99   
3                68                89               55               99   
4                68                89               57              100   

   record_min_temp_year  record_max_temp_year  actual_precipitation  \
0                  1919                  2012                  0.00   
1   

In [16]:
# Remove the 'newColumn'
df.drop(columns=['newColumn'], inplace=True)

# Print the head of the dataframe
print(df.head())


       date  actual_mean_temp  actual_min_temp  actual_max_temp  \
0  2014-7-1                81               70               91   
1  2014-7-2                85               74               95   
2  2014-7-3                82               71               93   
3  2014-7-4                75               64               86   
4  2014-7-5                72               60               84   

   average_min_temp  average_max_temp  record_min_temp  record_max_temp  \
0                67                89               56              104   
1                68                89               56              101   
2                68                89               56               99   
3                68                89               55               99   
4                68                89               57              100   

   record_min_temp_year  record_max_temp_year  actual_precipitation  \
0                  1919                  2012                  0.00   
1   

In [17]:
# Print the first 10 rows
print(df.head(10))

# Remove the row for '2014-7-3'
dropped_row = df[df['date'] == '2014-7-3']
df = df[df['date'] != '2014-7-3'].copy()

# Print the first 10 rows after removal
print(df.head(10))

# Save the dropped row in a variable of type Series
dropped_series = dropped_row.squeeze()


        date  actual_mean_temp  actual_min_temp  actual_max_temp  \
0   2014-7-1                81               70               91   
1   2014-7-2                85               74               95   
2   2014-7-3                82               71               93   
3   2014-7-4                75               64               86   
4   2014-7-5                72               60               84   
5   2014-7-6                74               61               87   
6   2014-7-7                79               67               91   
7   2014-7-8                83               72               94   
8   2014-7-9                80               71               89   
9  2014-7-10                78               71               85   

   average_min_temp  average_max_temp  record_min_temp  record_max_temp  \
0                67                89               56              104   
1                68                89               56              101   
2                68       

In [18]:
# Add the row back to the DataFrame
df = pd.concat([df, dropped_row])

# Sorting by date to maintain order after adding the row back
df['date'] = pd.to_datetime(df['date'])
df.sort_values('date', inplace=True)

# Print the first 10 rows after adding the row back
print(df.head(10))


        date  actual_mean_temp  actual_min_temp  actual_max_temp  \
0 2014-07-01                81               70               91   
1 2014-07-02                85               74               95   
2 2014-07-03                82               71               93   
3 2014-07-04                75               64               86   
4 2014-07-05                72               60               84   
5 2014-07-06                74               61               87   
6 2014-07-07                79               67               91   
7 2014-07-08                83               72               94   
8 2014-07-09                80               71               89   
9 2014-07-10                78               71               85   

   average_min_temp  average_max_temp  record_min_temp  record_max_temp  \
0                67                89               56              104   
1                68                89               56              101   
2                68       

In [22]:
# Update 'actual_min_temp' for '2014-7-3'
df.loc[df['date'] == '2014-7-3', 'actual_min_temp'] = 70

# Print the updated row
print(df[df['date'] == '2014-7-3'])


        date  actual_mean_temp  actual_min_temp  actual_max_temp  \
2 2014-07-03                82               70               93   

   average_min_temp  average_max_temp  record_min_temp  record_max_temp  \
2                68                89               56               99   

   record_min_temp_year  record_max_temp_year  actual_precipitation  \
2                  2010                  1931                  0.14   

   average_precipitation  record_precipitation  
2                   0.11                   2.5  


In [25]:
# Select top 10 rows
top_10_rows = df.head(10)

# Update 'actual_mean_temp' adding 5 where 'actual_min_temp' is odd
top_10_rows.loc[top_10_rows['actual_min_temp'] % 2 != 0, 'actual_mean_temp'] += 5

# Assign the updated top 10 rows back to the main dataframe
df.update(top_10_rows)

# Display the updated top 10 rows
print(df.head(10))

        date  actual_mean_temp  actual_min_temp  actual_max_temp  \
0 2014-07-01              81.0             70.0             91.0   
1 2014-07-02              85.0             74.0             95.0   
2 2014-07-03              82.0             70.0             93.0   
3 2014-07-04              75.0             64.0             86.0   
4 2014-07-05              72.0             60.0             84.0   
5 2014-07-06              79.0             61.0             87.0   
6 2014-07-07              84.0             67.0             91.0   
7 2014-07-08              83.0             72.0             94.0   
8 2014-07-09              85.0             71.0             89.0   
9 2014-07-10              83.0             71.0             85.0   

   average_min_temp  average_max_temp  record_min_temp  record_max_temp  \
0              67.0              89.0             56.0            104.0   
1              68.0              89.0             56.0            101.0   
2              68.0       

In [23]:
# Print the rows with absolute difference less than 30
diff_less_than_30 = df[abs(df['record_min_temp_year'] - df['record_max_temp_year']) < 30]
print(diff_less_than_30)


          date  actual_mean_temp  actual_min_temp  actual_max_temp  \
3   2014-07-04                75               64               86   
4   2014-07-05                72               60               84   
5   2014-07-06                74               61               87   
6   2014-07-07                79               67               91   
10  2014-07-11                78               68               87   
..         ...               ...              ...              ...   
354 2015-06-20                83               71               95   
356 2015-06-22                83               65              100   
359 2015-06-25                86               74               98   
360 2015-06-26                85               70              100   
362 2015-06-28                76               66               85   

     average_min_temp  average_max_temp  record_min_temp  record_max_temp  \
3                  68                89               55               99   
4    