# Data Mining - Lab - 2

# NumPy & Data Exploration with Pandas

-------------------------------------------------------------------------------
## Numpy

1) NumPy (Numerical Python) is a powerful open-source library in Python used for numerical and scientific computing.<br>
2) It provides support for large, multi-dimensional arrays and matrices, along with a collection of mathematical functions to operate on them efficiently.<br>
3) NumPy is highly optimized and written in C, making it much faster than using regular Python lists for numerical operations.<br>
4) It serves as the foundation for many other Python libraries in data science and machine learning, like pandas, TensorFlow, and scikit-learn.<br>
5) With features like broadcasting, vectorization, and integration with C/C++ code, NumPy allows for cleaner and faster code in numerical computations.<br>



### Step 1. Import the Numpy library

In [1]:
import numpy as np



### Step 2. Create a 1D array of numbers

In [2]:
# arange(10) produces the integers 0 through 9; the stop value itself is excluded
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [3]:
# with two arguments, arange counts from start (15) up to but not including stop (25)
arr = np.arange(15, 25)
arr

array([15, 16, 17, 18, 19, 20, 21, 22, 23, 24])

In [7]:
# np.array builds an ndarray from an existing Python list
arr2 = np.array([1, 2, 3, 4, 5])
# display the array that was just created (arr2, not the earlier arr)
arr2

array([1, 2, 3, 4, 5])

In [5]:
# check which Python type the array object has
print(type(arr))

<class 'numpy.ndarray'>


In [6]:
arr.dtype  # dtype is the one element type shared by every item in the array

dtype('int32')

### Step 3. Reshape 1D to 2D Array

In [8]:
# build 0..19 as a 1D array, then arrange it as 4 rows x 5 columns
# (rows * columns must equal the number of elements: 4 * 5 == 20)
arr3 = np.arange(20).reshape(4, 5)
arr3

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19]])

In [10]:
# a 2D array can also be written out directly: one inner list per row
arr4 = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
])
arr4

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12]])

### Step 4. Create a Linspace array

In [11]:
# linspace returns evenly spaced values from start to stop, both endpoints included.
# When the number of values is not given, 50 are generated.
np.linspace(14, 18)

array([14.        , 14.08163265, 14.16326531, 14.24489796, 14.32653061,
       14.40816327, 14.48979592, 14.57142857, 14.65306122, 14.73469388,
       14.81632653, 14.89795918, 14.97959184, 15.06122449, 15.14285714,
       15.2244898 , 15.30612245, 15.3877551 , 15.46938776, 15.55102041,
       15.63265306, 15.71428571, 15.79591837, 15.87755102, 15.95918367,
       16.04081633, 16.12244898, 16.20408163, 16.28571429, 16.36734694,
       16.44897959, 16.53061224, 16.6122449 , 16.69387755, 16.7755102 ,
       16.85714286, 16.93877551, 17.02040816, 17.10204082, 17.18367347,
       17.26530612, 17.34693878, 17.42857143, 17.51020408, 17.59183673,
       17.67346939, 17.75510204, 17.83673469, 17.91836735, 18.        ])

In [12]:
# the third argument sets how many evenly spaced values to generate (here 25)
np.linspace(14, 18, 25)

array([14.        , 14.16666667, 14.33333333, 14.5       , 14.66666667,
       14.83333333, 15.        , 15.16666667, 15.33333333, 15.5       ,
       15.66666667, 15.83333333, 16.        , 16.16666667, 16.33333333,
       16.5       , 16.66666667, 16.83333333, 17.        , 17.16666667,
       17.33333333, 17.5       , 17.66666667, 17.83333333, 18.        ])

### Step 5. Create an Array of Random Numbers

In [13]:
# 8 random floats drawn uniformly from [0, 1); the values differ on every run
np.random.rand(8)

array([0.58935271, 0.5974579 , 0.27830196, 0.82634043, 0.85005746,
       0.70173039, 0.91777925, 0.1084021 ])

In [14]:
# each positional argument is the length of one dimension: this gives a 5 x 5 array
np.random.rand(5, 5)

array([[0.46977335, 0.73609512, 0.65496958, 0.6849521 , 0.64675749],
       [0.21720368, 0.41312382, 0.27105669, 0.55481504, 0.26517938],
       [0.43863297, 0.4817021 , 0.48314882, 0.78598686, 0.70949816],
       [0.88820939, 0.17479755, 0.49767424, 0.68392447, 0.13548013],
       [0.11920043, 0.78737906, 0.59360354, 0.46909847, 0.57276611]])

### Step 6. Create a Random Integer Array

In [15]:
# 10 random integers from 20 up to (but not including) 40.
# size may also be passed positionally: np.random.randint(20, 40, 10)
np.random.randint(20, 40, size=10)

array([21, 32, 22, 23, 24, 30, 29, 26, 29, 38])

In [16]:
# a tuple for size produces a multi-dimensional result, here 5 rows x 5 columns
np.random.randint(20, 40, size=(5, 5))

array([[31, 20, 22, 31, 36],
       [26, 31, 20, 30, 35],
       [22, 39, 39, 33, 34],
       [30, 27, 24, 28, 20],
       [23, 34, 37, 32, 26]])

### Step 7. Create a 1D Array and get Max, Min, ArgMax, ArgMin

In [17]:
# third positional argument is size: 10 random integers from 1 up to (not including) 100
arr = np.random.randint(1, 100, 10)
arr

array([61, 81, 53, 55, 31, 97, 60, 48, 57, 66])

In [19]:
# largest value stored in the array
arr.max()

97

In [20]:
# smallest value stored in the array
arr.min()

31

In [24]:
arr.argmax()  # position (not value) of the maximum; the first one if it occurs more than once

5

In [25]:
arr.argmin()  # position (not value) of the minimum; the first one if it occurs more than once

4

### Step 8. Indexing in 1D Array

In [26]:
# fresh array of 10 random integers to practise indexing on
arr = np.random.randint(1, 100, 10)
arr

array([96, 27, 82, 56, 70, 33, 16, 86,  9, 28])

In [27]:
# indices are zero-based, so index 4 is the fifth element
arr[4]

70

In [28]:
# the slice end is exclusive: this returns the elements at indices 2 through 6
arr[2:7]

array([82, 56, 70, 33, 16])

### Step 9. Indexing in 2D Array

In [29]:
# passing a tuple as size gives a 5 x 5 array of random integers
arr = np.random.randint(1, 100, (5, 5))
arr

array([[ 4, 50, 52, 68, 45],
       [12, 46,  8, 58, 45],
       [41, 82, 11, 90, 41],
       [28, 19, 85, 25, 56],
       [93, 45, 73, 39,  2]])

In [30]:
# a single index on a 2D array selects a whole row; index 2 is the 3rd row
arr[2]

array([41, 82, 11, 90, 41])

In [31]:
# a slice selects consecutive rows: 1:4 covers indices 1, 2 and 3 (the 2nd to 4th rows)
arr[1:4]

array([[12, 46,  8, 58, 45],
       [41, 82, 11, 90, 41],
       [28, 19, 85, 25, 56]])

In [32]:
# [row, column] picks a single element: here the first row, third column
arr[0, 2]

52

In [33]:
# chained form: arr[0] selects the first row, then [2] its third element (same result as arr[0, 2])
arr[0][2]

52

### Step 10. Conditional Selection

In [34]:
# 10 random integers from 20 up to (not including) 40 to filter on
arr = np.random.randint(20, 40, 10)
arr

array([23, 34, 23, 23, 23, 36, 29, 21, 31, 39])

In [36]:
# arr > 25 yields one True/False per element; indexing with it keeps only the values greater than 25
arr[arr > 25]

array([34, 36, 29, 31, 39])

In [41]:
# combine element-wise conditions with & and parenthesise each comparison;
# this keeps the values from 25 to 35 inclusive
arr[(arr >= 25) & (arr <= 35)]

array([34, 29, 31])

### 🔥You did it! 10 exercises down — you're on fire! 🔥

## Pandas



### Step 1. Import the necessary libraries

In [42]:
import pandas as pd

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user). 

### Step 3. Assign it to a variable called users and use the 'user_id' as index

In [52]:
# the file is pipe-delimited; its user_id column becomes the row index
users = pd.read_csv(
    "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user",
    sep="|",
    index_col="user_id",
)

In [53]:
# a bare name on the last line of a cell displays the object; long frames are shown truncated
users

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
...,...,...,...,...
939,26,F,student,33319
940,32,M,administrator,02215
941,20,M,student,97229
942,48,F,librarian,78209


### Step 4. See the first 25 entries

In [54]:
# head(n) returns the first n rows of the DataFrame
users.head(25)

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
6,42,M,executive,98101
7,57,M,administrator,91344
8,36,M,administrator,5201
9,29,M,student,1002
10,53,M,lawyer,90703


### Step 5. See the last 10 entries

In [55]:
# tail(n) returns the last n rows of the DataFrame
users.tail(10)

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
934,61,M,engineer,22902
935,42,M,doctor,66221
936,24,M,other,32789
937,48,M,educator,98072
938,38,F,technician,55038
939,26,F,student,33319
940,32,M,administrator,2215
941,20,M,student,97229
942,48,F,librarian,78209
943,22,M,student,77841


### Step 6. What is the number of observations in the dataset?

In [66]:
# shape is a (rows, columns) tuple; element 0 is the number of rows (observations)
users.shape[0]

943

### Step 7. What is the number of columns in the dataset?

In [67]:
# element 1 of shape is the number of columns; the index (user_id) is not counted
users.shape[1]

4

### Step 8. Print the name of all the columns.

In [65]:
# column labels only; user_id is the index and therefore does not appear here
users.columns

Index(['age', 'gender', 'occupation', 'zip_code'], dtype='object')

### Step 9. How is the dataset indexed?

In [70]:
# the row labels: the user_id values chosen via index_col when the file was read
users.index

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            934, 935, 936, 937, 938, 939, 940, 941, 942, 943],
           dtype='int64', name='user_id', length=943)

### Step 10. What is the data type of each column?

In [71]:
# one dtype per column; note that zip_code comes back as object (text), not as an integer type
users.dtypes

age            int64
gender        object
occupation    object
zip_code      object
dtype: object

### Step 11. Print only the occupation column

In [72]:
# selecting a single column name with brackets returns that column as a Series
users['occupation']

user_id
1         technician
2              other
3             writer
4         technician
5              other
           ...      
939          student
940    administrator
941          student
942        librarian
943          student
Name: occupation, Length: 943, dtype: object

In [76]:
# attribute access is shorthand for users['occupation']; it only works when the column
# name is a valid Python identifier that does not clash with a DataFrame attribute
users.occupation

user_id
1         technician
2              other
3             writer
4         technician
5              other
           ...      
939          student
940    administrator
941          student
942        librarian
943          student
Name: occupation, Length: 943, dtype: object

### Step 12. How many different occupations are in this dataset?

In [79]:
# the distinct occupation values, listed in order of first appearance
users['occupation'].unique()

array(['technician', 'other', 'writer', 'executive', 'administrator',
       'student', 'lawyer', 'educator', 'scientist', 'entertainment',
       'programmer', 'librarian', 'homemaker', 'artist', 'engineer',
       'marketing', 'none', 'healthcare', 'retired', 'salesman', 'doctor'],
      dtype=object)

In [78]:
# how many distinct occupations there are (answers the question directly)
users['occupation'].nunique()

21

### Step 13. What is the most frequent occupation?

In [80]:
# number of users per occupation, sorted from most to least frequent
users['occupation'].value_counts()

student          196
other            105
educator          95
administrator     79
engineer          67
programmer        66
librarian         51
writer            45
executive         32
scientist         31
artist            28
technician        27
marketing         26
entertainment     18
healthcare        16
retired           14
lawyer            12
salesman          12
none               9
homemaker          7
doctor             7
Name: occupation, dtype: int64

In [81]:
# value_counts is sorted in descending order, so the first row is the most frequent occupation
users['occupation'].value_counts().head(1)

student    196
Name: occupation, dtype: int64

In [82]:
# the five most frequent occupations with their counts
users['occupation'].value_counts().head(5)

student          196
other            105
educator          95
administrator     79
engineer          67
Name: occupation, dtype: int64

In [83]:
# idxmax returns the label (the occupation name) that holds the largest count
users['occupation'].value_counts().idxmax()

'student'

### Step 14. Summarize the DataFrame.

In [84]:
# by default describe() summarises only the numeric columns, which here is just age
users.describe()

Unnamed: 0,age
count,943.0
mean,34.051962
std,12.19274
min,7.0
25%,25.0
50%,31.0
75%,43.0
max,73.0


### Step 15. Summarize all the columns

In [85]:
# include="all" adds the non-numeric columns; statistics that do not apply to a
# column (e.g. mean of gender, unique of age) are left empty in the result
users.describe(include="all")

Unnamed: 0,age,gender,occupation,zip_code
count,943.0,943,943,943.0
unique,,2,21,795.0
top,,M,student,55414.0
freq,,670,196,9.0
mean,34.051962,,,
std,12.19274,,,
min,7.0,,,
25%,25.0,,,
50%,31.0,,,
75%,43.0,,,


### Step 16. Summarize only the occupation column

In [86]:
# for a text column describe() reports count, unique, top and freq instead of numeric statistics
users['occupation'].describe()

count         943
unique         21
top       student
freq          196
Name: occupation, dtype: object

### Step 17. What is the mean age of users?

In [87]:
# arithmetic mean of the age column
users['age'].mean()

34.05196182396607

### Step 18. What is the age with least occurrence?

In [88]:
# value_counts sorts by descending count, so the tail holds the least frequent ages
users['age'].value_counts().tail()

7     1
66    1
11    1
10    1
73    1
Name: age, dtype: int64

In [90]:
# idxmin returns a single label: the first age that has the minimum count.
# NOTE: several ages tie at one occurrence (see the tail() output above), so this is
# only one of the least frequent ages, not the complete answer.
users['age'].value_counts().idxmin()

7

### You're not just learning, you're mastering it. Keep aiming higher! 🚀